diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 68b95ad82126ed..575636f6491ef6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,7 +14,6 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS - select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47b8874eddff6c..30ce7039bc273d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2933,7 +2933,6 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dff89d04fd1656..2c14312b05e8ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1819,7 +1819,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = btrfs_direct_IO(iocb, from); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -3476,26 +3476,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } -static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - ssize_t ret = 0; - - if (iocb->ki_flags & IOCB_DIRECT) { - struct inode *inode = file_inode(iocb->ki_filp); - - inode_lock_shared(inode); - ret = btrfs_direct_IO(iocb, to); - inode_unlock_shared(inode); - if (ret < 0) - return ret; - } - - return generic_file_buffered_read(iocb, to, ret); -} - const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = btrfs_file_read_iter, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b40fcfababc16..12b5d61f23bb1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include #include "misc.h" #include "ctree.h" @@ -58,9 +58,9 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - loff_t length; - ssize_t submitted; - struct extent_changeset *data_reserved; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; + int overwrite; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7045,7 +7045,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, int writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7183,7 +7183,30 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } +static int btrfs_get_blocks_direct_read(struct extent_map *em, + struct buffer_head *bh_result, + struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return -ENOENT; + + len = min(len, em->len - (start - em->start)); + + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; + set_buffer_mapped(bh_result); + + return 0; +} + static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7245,6 +7268,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ + len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { @@ -7255,73 +7279,64 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; + set_buffer_mapped(bh_result); + + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (start + len > i_size_read(inode)) + if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); + WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; + current->journal_info = dio_data; out: return ret; } -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned flags, struct iomap *iomap, - struct iomap *srcmap) +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; + u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); + u64 len = bh_result->b_size; int ret = 0; - u64 len = length; - bool unlock_extents = false; - if (!write) + if (!create) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(inode, - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transaction doesn't get + * confused. + */ + dio_data = current->journal_info; + current->journal_info = NULL; } - iomap->private = dio_data; - /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, + create)) { ret = -ENOTBLK; goto err; } @@ -7353,47 +7368,35 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, goto unlock_err; } - len = min(len, em->len - (start - em->start)); - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + if (create) { + ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, + dio_data, start, len); if (ret < 0) goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); } else { + ret = btrfs_get_blocks_direct_read(em, bh_result, inode, + start, len); + /* Can be negative only if we read from a hole */ + if (ret < 0) { + ret = 0; + free_extent_map(em); + goto unlock_err; + } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. - */ - if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; + lockstart = start + bh_result->b_size; + if (lockstart < lockend) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + } else { + free_extent_state(cached_state); + } } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; - iomap->length = len; free_extent_map(em); @@ -7403,53 +7406,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(inode, dio_data->data_reserved, - start, dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) -{ - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - __endio_write_update_ordered(inode, pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1); - ret = -ENOTBLK; - } - - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(inode, - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); - extent_changeset_free(dio_data->data_reserved); - } -out: - kfree(dio_data); - iomap->private = NULL; - + if (dio_data) + current->journal_info = dio_data; return ret; } @@ -7472,7 +7430,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - bio_endio(dip->dio_bio); + dio_end_io(dip->dio_bio); kfree(dip); } @@ -7708,11 +7666,24 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); + + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + /* + * Setting range start and end to the same value means that + * no cleanup will happen in btrfs_direct_IO + */ + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7729,7 +7700,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7738,8 +7708,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return BLK_QC_T_NONE; + dio_end_io(dio_bio); + return; } if (!write && csum) { @@ -7810,17 +7780,15 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err; } - dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7856,30 +7824,37 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, return retval; } -static const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; - -static const struct iomap_dio_ops btrfs_dops = { - .submit_io = btrfs_submit_direct, -}; - -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; + int flags = 0; + bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; + inode_dio_begin(inode); + + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. + */ count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); + if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7887,24 +7862,71 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { + dio_data.overwrite = 1; inode_unlock(inode); relock = true; } else if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); + if (ret) + goto out; + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + dio_data.reserve = round_up(count, + fs_info->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; + current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { + inode_dio_end(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dops, - is_sync_kiocb(iocb)); - + ret = __blockdev_direct_IO(iocb, inode, + fs_info->fs_devices->latest_bdev, + iter, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { up_read(&BTRFS_I(inode)->dio_sem); + current->journal_info = NULL; + if (ret < 0 && ret != -EIOCBQUEUED) { + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve, true); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + __endio_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + false); + } else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: + if (wakeup) + inode_dio_end(inode); if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -10220,7 +10242,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, - .direct_IO = noop_direct_IO, + .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION