Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull DAX updates from Dan Williams:
 "The completion of Jan's DAX work for 4.10.

  As I mentioned in the libnvdimm-for-4.10 pull request, these are some
  final fixes for the DAX dirty-cacheline-tracking invalidation work
  that was merged through the -mm, ext4, and xfs trees in -rc1. These
  patches were prepared prior to the merge window, but we waited for
  4.10-rc1 to have a stable merge base after all the prerequisites were
  merged.

  Quoting Jan on the overall changes in these patches:

     "So I'd like all these 6 patches to go for rc2. The first three
      patches fix invalidation of exceptional DAX entries (a bug which
      is there for a long time) - without these patches data loss can
      occur on power failure even though user called fsync(2). The other
      three patches change locking of DAX faults so that ->iomap_begin()
      is called in a more relaxed locking context and we are safe to
      start a transaction there for ext4"

  These have received a build success notification from the kbuild
  robot, and pass the latest libnvdimm unit tests. There have not been
  any -next releases since -rc1, so they have not appeared there"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  ext4: Simplify DAX fault path
  dax: Call ->iomap_begin without entry lock during dax fault
  dax: Finish fault completely when loading holes
  dax: Avoid page invalidation races and unnecessary radix tree traversals
  mm: Invalidate DAX radix tree entries only if appropriate
  ext2: Return BH_New buffers for zeroed blocks
torvalds committed Jan 1, 2017
2 parents 238d1d0 + 1db1754 commit 4759d38
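
The first three patches Jan describes reduce to a single rule: page-cache invalidation may drop an exceptional DAX radix tree entry only when the entry is clean (not tagged dirty or towrite), while truncation may always drop it, because dropping a dirty entry loses the record of which cachelines still need flushing on fsync(2). Below is a minimal, self-contained C sketch of that rule using made-up types and flags purely for illustration; the real implementation is __dax_invalidate_mapping_entry() in the fs/dax.c diff that follows.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the radix tree state used in fs/dax.c. */
struct dax_entry {
	bool present;     /* an exceptional DAX entry exists at this index */
	bool tag_dirty;   /* PAGECACHE_TAG_DIRTY: flush still pending */
	bool tag_towrite; /* PAGECACHE_TAG_TOWRITE: queued for writeback */
};

/*
 * Models the decision in __dax_invalidate_mapping_entry(): truncation
 * (trunc == true) may always remove the entry; plain invalidation may
 * only remove it when it is clean, otherwise fsync(2) would forget to
 * flush the dirtied cachelines and data could be lost on power failure.
 */
static bool may_drop_entry(const struct dax_entry *e, bool trunc)
{
	if (!e->present)
		return false;
	if (!trunc && (e->tag_dirty || e->tag_towrite))
		return false;
	return true;
}

int main(void)
{
	struct dax_entry dirty = { .present = true, .tag_dirty = true };
	struct dax_entry clean = { .present = true };

	printf("invalidate dirty entry: %d\n", may_drop_entry(&dirty, false)); /* 0 */
	printf("truncate dirty entry:   %d\n", may_drop_entry(&dirty, true));  /* 1 */
	printf("invalidate clean entry: %d\n", may_drop_entry(&clean, false)); /* 1 */
	return 0;
}

Built with any C compiler, it prints 0, 1, 1 for the three cases.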
Showing 5 changed files with 229 additions and 143 deletions.
243 changes: 154 additions & 89 deletions fs/dax.c
@@ -451,33 +451,84 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
int ret = 0;
void *entry;
struct radix_tree_root *page_tree = &mapping->page_tree;

spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (!entry || !radix_tree_exceptional_entry(entry))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
goto out;
radix_tree_delete(page_tree, index);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
/*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
void *entry;
int ret = __dax_invalidate_mapping_entry(mapping, index, true);

spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
* radix tree (usually fs-private i_mmap_sem for writing). Since the
* caller has seen exceptional entry for this index, we better find it
* at that index as well...
*/
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
spin_unlock_irq(&mapping->tree_lock);
return 0;
}
radix_tree_delete(&mapping->page_tree, index);
WARN_ON_ONCE(!ret);
return ret;
}

/*
* Invalidate exceptional DAX entry if easily possible. This handles DAX
* entries for invalidate_inode_pages() so we evict the entry only if we can
* do so without blocking.
*/
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
int ret = 0;
void *entry, **slot;
struct radix_tree_root *page_tree = &mapping->page_tree;

spin_lock_irq(&mapping->tree_lock);
entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
if (!entry || !radix_tree_exceptional_entry(entry) ||
slot_locked(mapping, slot))
goto out;
if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
goto out;
radix_tree_delete(page_tree, index);
mapping->nrexceptional--;
ret = 1;
out:
spin_unlock_irq(&mapping->tree_lock);
dax_wake_mapping_entry_waiter(mapping, index, entry, true);
if (ret)
dax_wake_mapping_entry_waiter(mapping, index, entry, true);
return ret;
}

return 1;
/*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
return __dax_invalidate_mapping_entry(mapping, index, false);
}

/*
@@ -488,24 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* otherwise it will simply fall out of the page cache under memory
* pressure without ever having been dirtied.
*/
static int dax_load_hole(struct address_space *mapping, void *entry,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
struct page *page;
int ret;

/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(entry)) {
vmf->page = entry;
return VM_FAULT_LOCKED;
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
goto out;
}

/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
if (!page)
return VM_FAULT_OOM;
out:
vmf->page = page;
return VM_FAULT_LOCKED;
ret = finish_fault(vmf);
vmf->page = NULL;
*entry = page;
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
return VM_FAULT_NOPAGE;
}
return ret;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
return -EIO;

/*
* Write can allocate block for an area which has a hole page mapped
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
}

while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
struct blk_dax_ctl dax = { 0 };
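
The comment added above states the user-visible guarantee at stake: data written with write(2) over an offset that was previously mmap()ed as a hole must become visible through the mapping, which is why the DAX path now calls invalidate_inode_pages2_range() when a write allocates new blocks (IOMAP_F_NEW). The small POSIX program below exercises that pattern; the file path and 4096-byte page size are assumptions, and on non-DAX filesystems the shared page cache already provides this coherence.

#define _XOPEN_SOURCE 700
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/dax-hole-demo";	/* arbitrary test path */
	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *map = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	/* Read fault over the hole: a zero page backs the mapping. */
	printf("before write: first byte = %d\n", map[0]);

	/*
	 * write(2) allocates a real block. The now-stale hole page must be
	 * torn down (the invalidate_inode_pages2_range() call in the hunk
	 * above) so the mapping reflects the newly written contents.
	 */
	if (pwrite(fd, "hello", 5, 0) != 5)
		return 1;

	printf("after write:  %.5s\n", map);	/* expect "hello" */

	munmap(map, 4096);
	close(fd);
	unlink(path);
	return 0;
}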
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE)
flags |= IOMAP_WRITE;

/*
* Yes, even DAX files can have page cache attached to them: A zeroed
* page is inserted into the pagecache when we have to serve a write
* fault on a hole. It should never be dirtied and can simply be
* dropped from the pagecache once we get real data for the page.
*
* XXX: This is racy against mmap, and there's nothing we can do about
* it. We'll eventually need to shift this down even further so that
* we can check if we allocated blocks over a hole first.
*/
if (mapping->nrpages) {
ret = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT,
(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
}

while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

static int dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
if (error == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
}

/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (pos >= i_size_read(inode))
return VM_FAULT_SIGBUS;

entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto out;
}

if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;

@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error)
goto unlock_entry;
return dax_fault_return(error);
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
error = -EIO; /* fs corruption? */
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
}

entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
vmf_ret = dax_fault_return(PTR_ERR(entry));
goto finish_iomap;
}

@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}

if (error)
goto finish_iomap;
goto error_unlock_entry;

__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
goto finish_iomap;
goto unlock_entry;
}

switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
error = dax_insert_mapping(mapping, iomap.bdev, sector,
PAGE_SIZE, &entry, vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, entry, vmf);
break;
vmf_ret = dax_load_hole(mapping, &entry, vmf);
goto unlock_entry;
}
/*FALLTHRU*/
default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break;
}

error_unlock_entry:
vmf_ret = dax_fault_return(error) | major;
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
if (error || (vmf_ret & VM_FAULT_ERROR)) {
/* keep previous error */
ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
&iomap);
} else {
error = ops->iomap_end(inode, pos, PAGE_SIZE,
PAGE_SIZE, flags, &iomap);
}
}
unlock_entry:
if (vmf_ret != VM_FAULT_LOCKED || error)
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY)
return VM_FAULT_SIGBUS | major;
if (vmf_ret) {
WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
return vmf_ret;
int copied = PAGE_SIZE;

if (vmf_ret & VM_FAULT_ERROR)
copied = 0;
/*
* The fault is done by now and there's no way back (other
* thread may be already happily using PTE we have installed).
* Just ignore error from ->iomap_end since we cannot do much
* with it.
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
return VM_FAULT_NOPAGE | major;
return vmf_ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
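
The reshuffled labels above are the locking change from the second half of the series: ->iomap_begin() now runs before grab_mapping_entry(), so a filesystem such as ext4 can start a transaction there without holding the DAX entry lock, and ->iomap_end() runs after the entry has been unlocked, with any late error deliberately ignored because the PTE is already installed. A compilable ordering sketch follows; every name in it is an illustrative stand-in, not a kernel API.

#include <stdio.h>

static void fs_iomap_begin(void) { puts("iomap_begin: fs may start a transaction"); }
static void grab_entry(void)     { puts("grab_mapping_entry: lock the DAX entry"); }
static void map_block(void)      { puts("dax_insert_mapping: install the PTE"); }
static void put_entry(void)      { puts("put_locked_mapping_entry: unlock entry"); }
static void fs_iomap_end(void)   { puts("iomap_end: commit, errors ignored here"); }

int main(void)
{
	/*
	 * New ordering after this series; the old code grabbed the entry
	 * before calling ->iomap_begin(), forcing the filesystem to start
	 * its transaction while holding the entry lock.
	 */
	fs_iomap_begin();
	grab_entry();
	map_block();
	put_entry();
	fs_iomap_end();
	return 0;
}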

@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
goto fallback;

/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto fallback;

/*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto unlock_entry;
goto fallback;

if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;

/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto finish_iomap;

vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
goto finish_iomap;
goto unlock_entry;
result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
&entry);
break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
break;
}

unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
if (result == VM_FAULT_FALLBACK) {
ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
&iomap);
} else {
error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
iomap_flags, &iomap);
if (error)
result = VM_FAULT_FALLBACK;
}
int copied = PMD_SIZE;

if (result == VM_FAULT_FALLBACK)
copied = 0;
/*
* The fault is done by now and there's no way back (other
* thread may be already happily using PMD we have installed).
* Just ignore error from ->iomap_end since we cannot do much
* with it.
*/
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, pmd, address);