Skip to content

Commit

Permalink
dio: optimize cache misses in the submission path
Browse files Browse the repository at this point in the history
Some investigation of a transaction processing workload showed that a
major consumer of cycles in __blockdev_direct_IO is the cache miss while
accessing the block size.  This is because it has to walk the chain from
block_dev to gendisk to queue.

The block size is needed early on to check alignment and sizes.  It's only
done if the check for the inode block size fails.  But the costly block
device state is unconditionally fetched.

- Reorganize the code to only fetch block dev state when actually
  needed.

Then do a prefetch on the block dev early on in the direct IO path.  This
is worth it, because there is substantial code run before we actually
touch the block dev now.

- I also added some unlikelies to make it clear the compiler that block
  device fetch code is not normally executed.

This gave a small, but measurable improvement on a large database
benchmark (about 0.3%)

[[email protected]: coding-style fixes]
[[email protected]: using prefetch requires including prefetch.h]
Signed-off-by: Andi Kleen <[email protected]>
Cc: Jeff Moyer <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Signed-off-by: Stephen Rothwell <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Andi Kleen authored and torvalds committed Jan 13, 2012
1 parent 87192a2 commit 65dd2aa
Showing 1 changed file with 37 additions and 9 deletions.
46 changes: 37 additions & 9 deletions fs/direct-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>

/*
* How many user pages to map in one call to get_user_pages(). This determines
Expand Down Expand Up @@ -1087,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
Expand All @@ -1097,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
size_t size;
unsigned long addr;
unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
Expand All @@ -1110,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw & WRITE)
rw = WRITE_ODIRECT;

if (bdev)
bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
/*
* Avoid references to bdev if not absolutely needed to give
* the early prefetch in the caller enough time.
*/

if (offset & blocksize_mask) {
if (bdev)
blkbits = bdev_blkbits;
blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
Expand All @@ -1126,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
if ((addr & blocksize_mask) || (size & blocksize_mask)) {
if (unlikely((addr & blocksize_mask) ||
(size & blocksize_mask))) {
if (bdev)
blkbits = bdev_blkbits;
blkbits = blksize_bits(
bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask))
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
Expand Down Expand Up @@ -1313,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
out:
return retval;
}

ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
/*
* The block device state is needed in the end to finally
* submit everything. Since it's likely to be cache cold
* prefetch it here as first thing to hide some of the
* latency.
*
* Attempt to prefetch the pieces we likely need later.
*/
prefetch(&bdev->bd_disk->part_tbl);
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);

return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_block, end_io,
submit_io, flags);
}

EXPORT_SYMBOL(__blockdev_direct_IO);

static __init int dio_init(void)
Expand Down

0 comments on commit 65dd2aa

Please sign in to comment.