Skip to content

Commit

Permalink
mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_co…
Browse files Browse the repository at this point in the history
…ntig

Convert generic_file_buffered_read() to get pages to read from in batches,
and then copy data to userspace from many pages at once - in particular,
we now don't touch any cachelines that might be contended while we're in
the loop to copy data to userspace.

This is is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance improvement
(1%, within the margin of error).

akpm: kernel test robot found a 32% speedup on one test:
https://lkml.kernel.org/r/20201030081456.GY31092@shao2-debian

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Kent Overstreet <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: kernel test robot <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
koverstreet authored and torvalds committed Dec 15, 2020
1 parent 723ef24 commit 06c0444
Showing 1 changed file with 175 additions and 138 deletions.
313 changes: 175 additions & 138 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2176,67 +2176,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
return lock_page_killable(page);
}

static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
struct iov_iter *iter,
struct page *page)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &iocb->ki_filp->f_ra;
unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
unsigned int bytes, copied;
loff_t isize, end_offset;

BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);

/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "bytes", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/

isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
return 1;

end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);

/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);

/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/

copied = copy_page_to_iter(page, offset, bytes, iter);

iocb->ki_pos += copied;

/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(page);

ra->prev_pos = iocb->ki_pos;

if (copied < bytes)
return -EFAULT;

return !iov_iter_count(iter) || iocb->ki_pos == isize;
}

static struct page *
generic_file_buffered_read_readpage(struct kiocb *iocb,
struct file *filp,
Expand Down Expand Up @@ -2394,6 +2333,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
}

static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
struct iov_iter *iter,
struct page **pages,
unsigned int nr)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
int i, j, nr_got, err = 0;

nr = min_t(unsigned long, last_index - index, nr);
find_page:
if (fatal_signal_pending(current))
return -EINTR;

nr_got = find_get_pages_contig(mapping, index, nr, pages);
if (nr_got)
goto got_pages;

if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;

page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);

nr_got = find_get_pages_contig(mapping, index, nr, pages);
if (nr_got)
goto got_pages;

pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
err = PTR_ERR_OR_ZERO(pages[0]);
if (!IS_ERR_OR_NULL(pages[0]))
nr_got = 1;
got_pages:
for (i = 0; i < nr_got; i++) {
struct page *page = pages[i];
pgoff_t pg_index = index + i;
loff_t pg_pos = max(iocb->ki_pos,
(loff_t) pg_index << PAGE_SHIFT);
loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;

if (PageReadahead(page)) {
if (iocb->ki_flags & IOCB_NOIO) {
for (j = i; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = -EAGAIN;
break;
}
page_cache_async_readahead(mapping, ra, filp, page,
pg_index, last_index - pg_index);
}

if (!PageUptodate(page)) {
if ((iocb->ki_flags & IOCB_NOWAIT) ||
((iocb->ki_flags & IOCB_WAITQ) && i)) {
for (j = i; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = -EAGAIN;
break;
}

page = generic_file_buffered_read_pagenotuptodate(iocb,
filp, iter, page, pg_pos, pg_count);
if (IS_ERR_OR_NULL(page)) {
for (j = i + 1; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = PTR_ERR_OR_ZERO(page);
break;
}
}
}

if (likely(nr_got))
return nr_got;
if (err)
return err;
/*
* No pages and no error means we raced and should retry:
*/
goto find_page;
}

/**
* generic_file_buffered_read - generic file read routine
* @iocb: the iocb to read
Expand All @@ -2414,104 +2439,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
size_t orig_count = iov_iter_count(iter);
pgoff_t last_index;
int error = 0;
struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
unsigned int nr_pages = min_t(unsigned int, 512,
((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
(iocb->ki_pos >> PAGE_SHIFT));
int i, pg_nr, error = 0;
bool writably_mapped;
loff_t isize, end_offset;

if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
if (nr_pages > ARRAY_SIZE(pages_onstack))
pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);

/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if (written && (iocb->ki_flags & IOCB_WAITQ))
iocb->ki_flags |= IOCB_NOWAIT;

for (;;) {
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
struct page *page;
if (!pages) {
pages = pages_onstack;
nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
}

do {
cond_resched();
find_page:
if (fatal_signal_pending(current)) {
error = -EINTR;
goto out;
}

/*
* We can't return -EIOCBQUEUED once we've done some work, so
* ensure we don't block:
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if ((iocb->ki_flags & IOCB_WAITQ) &&
(written + orig_count - iov_iter_count(iter)))
if ((iocb->ki_flags & IOCB_WAITQ) && written)
iocb->ki_flags |= IOCB_NOWAIT;

page = find_get_page(mapping, index);
if (!page) {
if (iocb->ki_flags & IOCB_NOIO)
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
page = generic_file_buffered_read_no_cached_page(iocb, iter);
if (!page)
goto find_page;
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto out;
}
}
}
if (PageReadahead(page)) {
if (iocb->ki_flags & IOCB_NOIO) {
put_page(page);
goto out;
}
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
put_page(page);
error = -EAGAIN;
goto out;
}
page = generic_file_buffered_read_pagenotuptodate(iocb,
filp, iter, page, iocb->ki_pos, iter->count);
if (!page)
goto find_page;
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto out;
}
i = 0;
pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
pages, nr_pages);
if (pg_nr < 0) {
error = pg_nr;
break;
}

error = generic_file_buffered_read_page_ok(iocb, iter, page);
put_page(page);
/*
* i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;

if (error) {
if (error > 0)
error = 0;
goto out;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
(end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
put_page(pages[--pg_nr]);

/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
writably_mapped = mapping_writably_mapped(mapping);

/*
* When a sequential read accesses a page several times, only
* mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(pages[0]);
for (i = 1; i < pg_nr; i++)
mark_page_accessed(pages[i]);

for (i = 0; i < pg_nr; i++) {
unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
PAGE_SIZE - offset);
unsigned int copied;

/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (writably_mapped)
flush_dcache_page(pages[i]);

copied = copy_page_to_iter(pages[i], offset, bytes, iter);

written += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;

if (copied < bytes) {
error = -EFAULT;
break;
}
}
}
put_pages:
for (i = 0; i < pg_nr; i++)
put_page(pages[i]);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

would_block:
error = -EAGAIN;
out:
file_accessed(filp);
written += orig_count - iov_iter_count(iter);

if (pages != pages_onstack)
kfree(pages);

return written ? written : error;
}
Expand Down

0 comments on commit 06c0444

Please sign in to comment.