Merge branch 'for-4.18/dax' into libnvdimm-for-next
djbw committed Jun 8, 2018
2 parents 808c340 + cc4a90a · commit b568457
Showing 21 changed files with 545 additions and 295 deletions.
14 changes: 11 additions & 3 deletions drivers/dax/super.c
@@ -86,6 +86,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
{
struct block_device *bdev = sb->s_bdev;
struct dax_device *dax_dev;
bool dax_enabled = false;
pgoff_t pgoff;
int err, id;
void *kaddr;
@@ -134,14 +135,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
* on being able to do (page_address(pfn_to_page())).
*/
WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
dax_enabled = true;
} else if (pfn_t_devmap(pfn)) {
/* pass */;
} else {
struct dev_pagemap *pgmap;

pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
dax_enabled = true;
put_dev_pagemap(pgmap);
}

if (!dax_enabled) {
pr_debug("VFS (%s): error: dax support not enabled\n",
sb->s_id);
return -EOPNOTSUPP;
}

return 0;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
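The net effect of the two hunks above is that __bdev_dax_supported() now insists a device's pfns are either "special" (the FS_DAX_LIMITED case), devmap-backed, or covered by a MEMORY_DEVICE_FS_DAX pagemap before dax may be enabled. A minimal sketch of a mount-path caller, assuming the 4.17-era bdev_dax_supported(sb, blocksize) wrapper; the flag name and message are illustrative, not from this series:

/* Illustrative mount-path check; EXAMPLE_MOUNT_DAX is hypothetical */
static void example_validate_dax(struct super_block *sb,
		unsigned long *mount_flags)
{
	if (!(*mount_flags & EXAMPLE_MOUNT_DAX))
		return;
	if (bdev_dax_supported(sb, sb->s_blocksize)) {
		pr_warn("%s: dax not supported, using page cache\n",
				sb->s_id);
		*mount_flags &= ~EXAMPLE_MOUNT_DAX;
	}
}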
2 changes: 0 additions & 2 deletions drivers/nvdimm/pfn_devs.c
@@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
res->start += start_pad;
res->end -= end_trunc;

pgmap->type = MEMORY_DEVICE_HOST;

if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < SZ_8K)
return -EINVAL;
25 changes: 25 additions & 0 deletions drivers/nvdimm/pmem.c
@@ -289,6 +289,27 @@ static void pmem_release_disk(void *__pmem)
put_disk(pmem->disk);
}

static void pmem_release_pgmap_ops(void *__pgmap)
{
dev_pagemap_put_ops();
}

static void fsdax_pagefree(struct page *page, void *data)
{
wake_up_var(&page->_refcount);
}

static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
{
dev_pagemap_get_ops();
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
return -ENOMEM;
pgmap->type = MEMORY_DEVICE_FS_DAX;
pgmap->page_free = fsdax_pagefree;

return 0;
}

static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
@@ -347,6 +368,8 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
if (is_nd_pfn(dev)) {
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -358,6 +381,8 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false;
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
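setup_pagemap_fsdax() is what ties the pmem driver into the new page-idle machinery: it tags the pagemap as MEMORY_DEVICE_FS_DAX and installs fsdax_pagefree(), so the final put_page() on a dax page wakes anyone sleeping on that page's _refcount. A minimal sketch of the waiting side, assuming the generic wait_var_event() helper (the xfs hunks below use the lower-level ___wait_var_event() variant so they can drop locks while asleep):

/* Illustrative waiter pairing with wake_up_var() in fsdax_pagefree() */
static void example_wait_for_idle(struct address_space *mapping)
{
	struct page *page;

	while ((page = dax_layout_busy_page(mapping)) != NULL)
		wait_var_event(&page->_refcount,
				atomic_read(&page->_refcount) == 1);
}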
1 change: 1 addition & 0 deletions fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP
select DAX
help
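The new select ensures the DEV_PAGEMAP_OPS dispatch code is built whenever FS_DAX can hand out ZONE_DEVICE pages. That code keeps put_page() cheap for everyone else by hiding the pgmap->page_free callout behind a static key, which dev_pagemap_get_ops()/dev_pagemap_put_ops() enable and disable. Roughly, as a condensed sketch of the 4.18-era mechanism, not the verbatim mm code (the real check also covers the device-private memory types):

DEFINE_STATIC_KEY_FALSE(devmap_managed_key);

void dev_pagemap_get_ops(void)
{
	static_branch_inc(&devmap_managed_key);
}

/* consulted from put_page(); nearly free while the key is disabled */
static bool example_put_devmap_managed_page(struct page *page)
{
	if (!static_branch_unlikely(&devmap_managed_key))
		return false;
	if (!is_zone_device_page(page))
		return false;
	if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
		return false;
	__put_devmap_managed_page(page);	/* fires pgmap->page_free */
	return true;
}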
115 changes: 99 additions & 16 deletions fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
}
}

static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;

for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);

if (page_ref_count(page) > 1)
return page;
}
return NULL;
}

/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
return entry;
}

/**
* dax_layout_busy_page - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
* page->count == 1. A filesystem uses this interface to determine if
* any page in the mapping is busy, i.e. for DMA, or other
* get_user_pages() usages.
*
* It is expected that the filesystem is holding locks to block the
* establishment of new mappings in this address_space. I.e. it expects
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
pgoff_t indices[PAGEVEC_SIZE];
struct page *page = NULL;
struct pagevec pvec;
pgoff_t index, end;
unsigned i;

/*
* In the 'limited' case get_user_pages() for dax is disabled.
*/
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return NULL;

if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;

pagevec_init(&pvec);
index = 0;
end = -1;

/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the pagevec_lookup and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
* holding those locks, and unmap_mapping_range() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
unmap_mapping_range(mapping, 0, 0, 1);

while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *pvec_ent = pvec.pages[i];
void *entry;

index = indices[i];
if (index >= end)
break;

if (!radix_tree_exceptional_entry(pvec_ent))
continue;

xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (entry)
page = dax_busy_page(entry);
put_unlocked_mapping_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
if (page)
break;
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
index++;

if (page)
break;
}
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -912,7 +1009,6 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
unsigned long vaddr = vmf->address;
int ret = VM_FAULT_NOPAGE;
struct page *zero_page;
void *entry2;
pfn_t pfn;

zero_page = ZERO_PAGE(0);
@@ -922,13 +1018,8 @@
}

pfn = page_to_pfn_t(zero_page);
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
}

dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
false);
vm_insert_mixed(vmf->vma, vaddr, pfn);
out:
trace_dax_load_hole(inode, vmf, ret);
@@ -1240,10 +1331,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,

entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto error_finish_iomap;
}

/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;

ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,

entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;

/*
* If we are doing synchronous page fault and inode needs fsync,
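dax_busy_page() can get away with a bare reference count check because ZONE_DEVICE pages are never handed to the page allocator; an idle dax page sits at a count of exactly one, so anything higher means an outstanding get_user_pages() pin. An illustrative sketch of such a pin holder (not code from this series) showing what dax_layout_busy_page() ends up waiting out:

/* Illustrative get_user_pages() user that keeps dax pages "busy" */
static int example_pin_for_dma(unsigned long user_addr, int nr_pages)
{
	struct page *pages[16];
	int i, pinned;

	pinned = get_user_pages_fast(user_addr, min(nr_pages, 16), 1, pages);
	if (pinned <= 0)
		return pinned;

	/* each page now has _refcount > 1; dax_busy_page() reports it */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* returning to the idle count fires
					 * fsdax_pagefree() and wakes waiters */
	return 0;
}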
72 changes: 66 additions & 6 deletions fs/xfs/xfs_file.c
@@ -312,7 +312,7 @@ xfs_file_aio_write_checks(
if (error <= 0)
return error;

error = xfs_break_layouts(inode, iolock);
error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
if (error)
return error;

@@ -718,6 +718,69 @@ xfs_file_write_iter(
return ret;
}

static void
xfs_wait_dax_page(
struct inode *inode,
bool *did_unlock)
{
struct xfs_inode *ip = XFS_I(inode);

*did_unlock = true;
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
schedule();
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
struct inode *inode,
uint iolock,
bool *did_unlock)
{
struct page *page;

ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

page = dax_layout_busy_page(inode->i_mapping);
if (!page)
return 0;

return ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
0, 0, xfs_wait_dax_page(inode, did_unlock));
}

int
xfs_break_layouts(
struct inode *inode,
uint *iolock,
enum layout_break_reason reason)
{
bool retry;
int error;

ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

do {
retry = false;
switch (reason) {
case BREAK_UNMAP:
error = xfs_break_dax_layouts(inode, *iolock, &retry);
if (error || retry)
break;
/* fall through */
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
default:
WARN_ON_ONCE(1);
error = -EINVAL;
}
} while (error == 0 && retry);

return error;
}

#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -734,7 +797,7 @@
struct xfs_inode *ip = XFS_I(inode);
long error;
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;

@@ -744,13 +807,10 @@
return -EOPNOTSUPP;

xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;

xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;

if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
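The unusual-looking ___wait_var_event() call in xfs_break_dax_layouts() passes xfs_wait_dax_page() as the trailing cmd argument, which the macro runs in place of a plain schedule() whenever the condition is still false; that is how XFS_MMAPLOCK_EXCL gets dropped for the duration of each sleep. In effective control flow, approximately (a sketch only; the real macro parks on a variable-specific waitqueue):

/* Sketch of what the ___wait_var_event() call above boils down to */
static int example_wait_one_page(struct inode *inode, struct page *page,
		bool *did_unlock)
{
	for (;;) {
		if (atomic_read(&page->_refcount) == 1)
			return 0;
		if (signal_pending(current))
			return -ERESTARTSYS;	/* TASK_INTERRUPTIBLE */
		/* drop XFS_MMAPLOCK_EXCL, schedule(), re-take the lock */
		xfs_wait_dax_page(inode, did_unlock);
	}
}

Because the lock is dropped while sleeping, *did_unlock is set, which is what forces the retry loop in xfs_break_layouts() to go around again and re-check for newly established mappings.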
16 changes: 16 additions & 0 deletions fs/xfs/xfs_inode.h
@@ -378,6 +378,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
>> XFS_ILOCK_SHIFT)

/*
* Layouts are broken in the BREAK_WRITE case to ensure that
* layout-holders do not collide with local writes. Additionally,
* layouts are broken in the BREAK_UNMAP case to make sure the
* layout-holder has a consistent view of the file's extent map. While
* BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
* BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
* go idle.
*/
enum layout_break_reason {
BREAK_WRITE,
BREAK_UNMAP,
};

/*
* For multiple groups support: if S_ISGID bit is set in the parent
* directory, group of new file is set to that of the parent, and
@@ -443,6 +457,8 @@ enum xfs_prealloc_flags {

int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);

/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
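A recap of how the two reason codes map onto the xfs_file.c call sites updated above (fragments copied from those hunks, not new API):

/* xfs_file_aio_write_checks(): local write, recall leases only */
error = xfs_break_layouts(inode, iolock, BREAK_WRITE);

/* xfs_file_fallocate(): extent manipulation; also waits for busy dax
 * pages, which is why the caller now takes XFS_MMAPLOCK_EXCL up front */
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);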
(Diffs for the remaining 14 of the 21 changed files are not shown here.)
