Skip to content

Commit

Permalink
mm/filemap: remove hugetlb special casing in filemap.c
Browse files Browse the repository at this point in the history
Remove special-cased hugetlb handling code within the page cache by
changing the granularity of ->index to the base page size rather than the
huge page size.  The motivation of this patch is to reduce complexity
within the filemap code while also increasing performance by removing
branches that are evaluated on every page cache lookup.

To support the change in index, new wrappers for hugetlb page cache
interactions are added.  These wrappers perform the conversion to a linear
index which is now expected by the page cache for huge pages.

========================= PERFORMANCE ======================================

Perf was used to check the performance differences after the patch.
Overall the performance is similar to mainline, with a very small
additional overhead in __filemap_add_folio() and
hugetlb_add_to_page_cache().  This is because of the larger overhead in
xa_load() and xa_store(), as the xarray now uses more entries to store
hugetlb folios in the page cache.

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s
    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s

aarch64:
    workload - fallocate a 700GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--95.04%--__pi_clear_page
                          |
                          |--3.57%--clear_huge_page
                          |          |
                          |          |--2.63%--rcu_all_qs
                          |          |
                          |           --0.91%--__cond_resched
                          |
                           --0.67%--__cond_resched
            0.17%     0.00%             0  fallocate  [kernel.vmlinux]       [k] hugetlb_add_to_page_cache
            0.14%     0.10%            11  fallocate  [kernel.vmlinux]       [k] __filemap_add_folio

    6.5-rc3
        2MB Page Size:
                --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--94.91%--__pi_clear_page
                          |
                          |--4.11%--clear_huge_page
                          |          |
                          |          |--3.00%--rcu_all_qs
                          |          |
                          |           --1.10%--__cond_resched
                          |
                           --0.59%--__cond_resched
            0.08%     0.01%             1  fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
            0.05%     0.03%             3  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86
    workload - fallocate a 100GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            hugetlbfs_fallocate
            |
            --99.57%--clear_huge_page
                |
                --98.47%--clear_page_erms
                    |
                    --0.53%--asm_sysvec_apic_timer_interrupt

            0.04%     0.04%             1  fallocate  [kernel.kallsyms]     [k] xa_load
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] hugetlb_add_to_page_cache
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] __filemap_add_folio
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] xas_store

    6.5-rc3
        2MB Page Size:
                --99.93%--__x64_sys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                           --99.38%--clear_huge_page
                                     |
                                     |--98.40%--clear_page_erms
                                     |
                                      --0.59%--__cond_resched
            0.03%     0.03%             1  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes libhugetlbfs tests and LTP hugetlb tests

********** TEST SUMMARY
*                      2M
*                      32-bit 64-bit
*     Total testcases:   110    113
*             Skipped:     0      0
*                PASS:   107    113
*                FAIL:     0      0
*    Killed by signal:     3      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
*    Test not present:     0      0
* Strange test result:     0      0
**********

    Done executing testcases.
    LTP Version:  20220527-178-g2761a81c4

Page migration was also tested using Mike Kravetz's test program.[8]

[[email protected]: fix a NULL vs IS_ERR() bug]
  Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Sidhartha Kumar <[email protected]>
Signed-off-by: Dan Carpenter <[email protected]>
Reported-and-tested-by: [email protected]
Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Muchun Song <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
sidkumar99 authored and akpm00 committed Oct 16, 2023
1 parent 0374af1 commit a08c719
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 101 deletions.
37 changes: 19 additions & 18 deletions fs/hugetlbfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;

while (iov_iter_count(to)) {
struct page *page;
struct folio *folio;
size_t nr, copied, want;

/* nr is the maximum number of bytes to copy from this page */
Expand All @@ -352,38 +352,38 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
nr = nr - offset;

/* Find the page */
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
/* Find the folio */
folio = filemap_lock_hugetlb_folio(h, mapping, index);
if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
unlock_page(page);
folio_unlock(folio);

if (!PageHWPoison(page))
if (!folio_test_has_hwpoisoned(folio))
want = nr;
else {
/*
* Adjust how many bytes safe to read without
* touching the 1st raw HWPOISON subpage after
* offset.
*/
want = adjust_range_hwpoison(page, offset, nr);
want = adjust_range_hwpoison(&folio->page, offset, nr);
if (want == 0) {
put_page(page);
folio_put(folio);
retval = -EIO;
break;
}
}

/*
* We have the page, copy it to user space buffer.
* We have the folio, copy it to user space buffer.
*/
copied = copy_page_to_iter(page, offset, want, to);
put_page(page);
copied = copy_folio_to_iter(folio, offset, want, to);
folio_put(folio);
}
offset += copied;
retval += copied;
Expand Down Expand Up @@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);

folio_batch_init(&fbatch);
next = start;
next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;

index = folio->index;
index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);

Expand All @@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}

if (truncate_op)
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
(void)hugetlb_unreserve_pages(inode,
lstart >> huge_page_shift(h),
LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
Expand Down Expand Up @@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
pgoff_t idx = start >> huge_page_shift(h);
struct folio *folio;

folio = filemap_lock_folio(mapping, idx);
folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
return;

Expand Down Expand Up @@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_lock(&hugetlb_fault_mutex_table[hash]);

/* See if already present in mapping to avoid alloc/free */
folio = filemap_get_folio(mapping, index);
folio = filemap_get_folio(mapping, index << huge_page_order(h));
if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
Expand Down
12 changes: 12 additions & 0 deletions include/linux/hugetlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
return huge_page_size(h) / 512;
}

static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
struct address_space *mapping, pgoff_t idx)
{
return filemap_lock_folio(mapping, idx << huge_page_order(h));
}

#include <asm/hugetlb.h>

#ifndef is_hugepage_only_range
Expand Down Expand Up @@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
return NULL;
}

static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
struct address_space *mapping, pgoff_t idx)
{
return NULL;
}

static inline int isolate_or_dissolve_huge_page(struct page *page,
struct list_head *list)
{
Expand Down
32 changes: 2 additions & 30 deletions include/linux/pagemap.h
Original file line number Diff line number Diff line change
Expand Up @@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
*/
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
/* HugeTLBfs indexes the page cache in units of hpage_size */
if (folio_test_hugetlb(folio))
return &folio->page;
return folio_page(folio, index & (folio_nr_pages(folio) - 1));
}

Expand All @@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
*/
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
/* HugeTLBfs indexes the page cache in units of hpage_size */
if (folio_test_hugetlb(folio))
return folio->index == index;
return index - folio_index(folio) < folio_nr_pages(folio);
}

Expand Down Expand Up @@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
}

/*
* Get index of the page within radix-tree (but not for hugetlb pages).
* (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
* Get the offset in PAGE_SIZE (even for hugetlb pages).
*/
static inline pgoff_t page_to_index(struct page *page)
static inline pgoff_t page_to_pgoff(struct page *page)
{
struct page *head;

Expand All @@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)
return head->index + page - head;
}

extern pgoff_t hugetlb_basepage_index(struct page *page);

/*
* Get the offset in PAGE_SIZE (even for hugetlb pages).
* (TODO: hugetlb pages should have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
if (unlikely(PageHuge(page)))
return hugetlb_basepage_index(page);
return page_to_index(page);
}

/*
* Return byte-offset into filesystem object for page.
*/
Expand Down Expand Up @@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)

/*
* Get the offset in PAGE_SIZE (even for hugetlb folios).
* (TODO: hugetlb folios should have ->index in PAGE_SIZE)
*/
static inline pgoff_t folio_pgoff(struct folio *folio)
{
if (unlikely(folio_test_hugetlb(folio)))
return hugetlb_basepage_index(&folio->page);
return folio->index;
}

extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
unsigned long address);

static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff;
Expand Down
34 changes: 10 additions & 24 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping,

mapping_set_update(&xas, mapping);

/* hugetlb pages are represented by a single entry in the xarray */
if (!folio_test_hugetlb(folio)) {
xas_set_order(&xas, folio->index, folio_order(folio));
nr = folio_nr_pages(folio);
}
xas_set_order(&xas, folio->index, folio_order(folio));
nr = folio_nr_pages(folio);

VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

Expand Down Expand Up @@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
if (free_folio)
free_folio(folio);

if (folio_test_large(folio) && !folio_test_hugetlb(folio))
if (folio_test_large(folio))
refs = folio_nr_pages(folio);
folio_put_refs(folio, refs);
}
Expand Down Expand Up @@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,

if (!huge) {
int error = mem_cgroup_charge(folio, NULL, gfp);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
return error;
charged = true;
xas_set_order(&xas, index, folio_order(folio));
nr = folio_nr_pages(folio);
}

VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
xas_set_order(&xas, index, folio_order(folio));
nr = folio_nr_pages(folio);

gfp &= GFP_RECLAIM_MASK;
folio_ref_add(folio, nr);
folio->mapping = mapping;
Expand Down Expand Up @@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
int idx = folio_batch_count(fbatch) - 1;

folio = fbatch->folios[idx];
if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
Expand Down Expand Up @@ -2104,7 +2102,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
int idx = folio_batch_count(fbatch) - 1;

folio = fbatch->folios[idx];
if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
Expand Down Expand Up @@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
continue;
if (!folio_batch_add(fbatch, folio)) {
unsigned long nr = folio_nr_pages(folio);

if (folio_test_hugetlb(folio))
nr = 1;
*start = folio->index + nr;
goto out;
}
Expand Down Expand Up @@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,

if (!folio_batch_add(fbatch, folio)) {
nr = folio_nr_pages(folio);

if (folio_test_hugetlb(folio))
nr = 1;
*start = folio->index + nr;
goto out;
}
Expand All @@ -2232,10 +2224,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,

if (nr) {
folio = fbatch->folios[nr - 1];
if (folio_test_hugetlb(folio))
*start = folio->index + 1;
else
*start = folio_next_index(folio);
*start = folio->index + folio_nr_pages(folio);
}
out:
rcu_read_unlock();
Expand Down Expand Up @@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
continue;
if (!folio_batch_add(fbatch, folio)) {
unsigned long nr = folio_nr_pages(folio);

if (folio_test_hugetlb(folio))
nr = 1;
*start = folio->index + nr;
goto out;
}
Expand Down
Loading

0 comments on commit a08c719

Please sign in to comment.