Skip to content

Commit

Permalink
ext4: Support for synchronous DAX faults
Browse files Browse the repository at this point in the history
We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to
prepare blocks for writing and the inode has some uncommitted metadata
changes. In the fault handler ext4_dax_fault() we then detect this case
(through VM_FAULT_NEEDDSYNC return value) and call helper
dax_finish_sync_fault() to flush metadata changes and insert page table
entry. Note that this will also dirty corresponding radix tree entry
which is what we want - fsync(2) will still provide data integrity
guarantees for applications not using userspace flushing. And
applications using userspace flushing can avoid calling fsync(2) and
thus avoid the performance overhead.

Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
  • Loading branch information
jankara authored and djbw committed Nov 3, 2017
1 parent 497f692 commit b8a6176
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 1 deletion.
15 changes: 14 additions & 1 deletion fs/ext4/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
Expand Down Expand Up @@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
pfn_t pfn;

if (write) {
sb_start_pagefault(sb);
Expand All @@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
} else {
Expand Down Expand Up @@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;

/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;

file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
Expand Down Expand Up @@ -719,6 +731,7 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
Expand Down
15 changes: 15 additions & 0 deletions fs/ext4/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}

#ifdef CONFIG_FS_DAX
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

if (journal)
return !jbd2_transaction_committed(journal,
EXT4_I(inode)->i_datasync_tid);
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}

static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
Expand Down Expand Up @@ -3466,6 +3479,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}

iomap->flags = 0;
if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode))
iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits;
Expand Down
17 changes: 17 additions & 0 deletions fs/jbd2/journal.c
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}

/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
int ret = 1;

read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid)
ret = 0;
if (journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid)
ret = 0;
read_unlock(&journal->j_state_lock);
return ret;
}
EXPORT_SYMBOL(jbd2_transaction_committed);

/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
Expand Down
1 change: 1 addition & 0 deletions include/linux/jbd2.h
Original file line number Diff line number Diff line change
Expand Up @@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
Expand Down

0 comments on commit b8a6176

Please sign in to comment.