Skip to content

Commit

Permalink
ext4: add DAX functionality
Browse files Browse the repository at this point in the history
This is a port of the DAX functionality found in the current version of
ext2.

[[email protected]: heavily tweaked]
[[email protected]: remap_pages went away]
Signed-off-by: Ross Zwisler <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Signed-off-by: Matthew Wilcox <[email protected]>
Cc: Boaz Harrosh <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Ross Zwisler authored and torvalds committed Feb 17, 2015
1 parent 25726bc commit 923ae0f
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 37 deletions.
1 change: 1 addition & 0 deletions Documentation/filesystems/dax.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ or a write()) work correctly.

These filesystems may be used for inspiration:
- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt


Shortcomings
Expand Down
4 changes: 4 additions & 0 deletions Documentation/filesystems/ext4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ max_dir_size_kb=n This limits the size of directories so that any
i_version Enable 64-bit inode version support. This option is
off by default.

dax Use direct access (no page cache). See
Documentation/filesystems/dax.txt. Note that
this option is incompatible with data=journal.

Data Mode
=========
There are 3 different data modes:
Expand Down
6 changes: 6 additions & 0 deletions fs/ext4/ext4.h
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,11 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
#else
#define EXT4_MOUNT_DAX 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
Expand Down Expand Up @@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern const struct file_operations ext4_dax_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/* inline.c */
Expand Down
49 changes: 47 additions & 2 deletions fs/ext4/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
int o_direct = file->f_flags & O_DIRECT;
int o_direct = io_is_direct(file);
int overwrite = 0;
size_t length = iov_iter_count(from);
ssize_t ret;
Expand Down Expand Up @@ -191,6 +191,26 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ret;
}

#ifdef CONFIG_FS_DAX
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_fault(vma, vmf, ext4_get_block);
/* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext4_get_block);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.page_mkwrite = ext4_dax_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
Expand All @@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
vma->vm_flags |= VM_MIXEDMAP;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
return 0;
}

Expand Down Expand Up @@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = {
.fallocate = ext4_fallocate,
};

#ifdef CONFIG_FS_DAX
const struct file_operations ext4_dax_file_operations = {
.llseek = ext4_llseek,
.read = new_sync_read,
.write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
/* Splice not yet supported with DAX */
.fallocate = ext4_fallocate,
};
#endif

const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
Expand Down
18 changes: 13 additions & 5 deletions fs/ext4/indirect.c
Original file line number Diff line number Diff line change
Expand Up @@ -689,14 +689,22 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
inode_dio_done(inode);
goto locked;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iter, offset,
ext4_get_block, NULL, NULL, 0);
if (IS_DAX(inode))
ret = dax_do_io(rw, iocb, inode, iter, offset,
ext4_get_block, NULL, 0);
else
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iter, offset,
ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
ret = blockdev_direct_IO(rw, iocb, inode, iter,
offset, ext4_get_block);
if (IS_DAX(inode))
ret = dax_do_io(rw, iocb, inode, iter, offset,
ext4_get_block, NULL, DIO_LOCKING);
else
ret = blockdev_direct_IO(rw, iocb, inode, iter,
offset, ext4_get_block);

if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
Expand Down
89 changes: 62 additions & 27 deletions fs/ext4/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return retval;
}

static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
return;
WARN_ON(!buffer_unwritten(bh));
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

Expand Down Expand Up @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,

map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
Expand Down Expand Up @@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iter,
offset,
get_block_func,
ext4_end_io_dio,
NULL,
dio_flags);
if (IS_DAX(inode))
ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
ext4_end_io_dio, dio_flags);
else
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iter, offset,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);

/*
* Put our reference to io_end. This can free the io_end structure e.g.
Expand Down Expand Up @@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}

/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
* that cooresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize, max, pos;
unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
Expand All @@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
return -ENOMEM;

blocksize = inode->i_sb->s_blocksize;
max = blocksize - (offset & (blocksize - 1));

/*
* correct length if it does not fall between
* 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;

iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

Expand Down Expand Up @@ -3277,6 +3280,33 @@ static int ext4_block_zero_page_range(handle_t *handle,
return err;
}

/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
* that cooresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
struct inode *inode = mapping->host;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned max = blocksize - (offset & (blocksize - 1));

/*
* correct length if it does not fall between
* 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;

if (IS_DAX(inode))
return dax_zero_page_range(inode, from, length, ext4_get_block);
return __ext4_block_zero_page_range(handle, mapping, from, length);
}

/*
* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
* up to the end of the block which corresponds to `from'.
Expand Down Expand Up @@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
if (test_opt(inode->i_sb, DAX))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
Expand Down Expand Up @@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)

if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
if (test_opt(inode->i_sb, DAX))
inode->i_fop = &ext4_dax_file_operations;
else
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext4_dir_inode_operations;
Expand Down Expand Up @@ -4534,7 +4569,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
Expand Down
10 changes: 8 additions & 2 deletions fs/ext4/namei.c
Original file line number Diff line number Diff line change
Expand Up @@ -2235,7 +2235,10 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
if (test_opt(inode->i_sb, DAX))
inode->i_fop = &ext4_dax_file_operations;
else
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
Expand Down Expand Up @@ -2299,7 +2302,10 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
if (test_opt(inode->i_sb, DAX))
inode->i_fop = &ext4_dax_file_operations;
else
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
Expand Down
Loading

0 comments on commit 923ae0f

Please sign in to comment.