Skip to content

Commit

Permalink
mm/gup: handle hugepd for follow_page()
Browse files Browse the repository at this point in the history
Hugepd is so far only used on PowerPC, on 4K page size kernels where the
hash MMU is used.  follow_page_mask() used to leverage hugetlb APIs to
access hugepd entries.  Teach follow_page_mask() itself about hugepd.

With the previous refactoring of fast-gup's gup_huge_pd(), most of the code
can be reused.  Some of it is not needed for follow_page(): for example,
gup_hugepte() tries to detect pgtable entry changes, which will never happen
with slow gup (which holds the pgtable lock), but the extra check is
harmless.

Since follow_page() only ever fetches one page, setting the end to "address +
PAGE_SIZE" should suffice.  We will still do the pgtable walk only once for
each hugetlb page by setting ctx->page_mask properly.

One thing worth mentioning is that some levels of the pgtable's _bad()
helpers will report is_hugepd() entries as TRUE on Power8 hash MMUs.  I
think it at least applies to PUD on Power8 with 4K pgsize.  It means feeding
a hugepd entry to pud_bad() will report a false positive.  Let's leave that
for now because it may be arch-specific code that I am a bit disinclined to
touch.  In this patch it's not a problem as long as hugepd is detected
before any bad pgtable entries.

To allow slow gup paths like follow_*_page() to access the hugepd helpers,
the hugepd code is moved to the top.  Besides that, the helper
record_subpages() will now be used by either hugepd or fast-gup.  To avoid
"unused function" warnings we must provide an "#ifdef" for it, unfortunately.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Peter Xu <[email protected]>
Tested-by: Ryan Roberts <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Andrew Jones <[email protected]>
Cc: Aneesh Kumar K.V (IBM) <[email protected]>
Cc: Axel Rasmussen <[email protected]>
Cc: Christophe Leroy <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: James Houghton <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: "Mike Rapoport (IBM)" <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Yang Shi <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
xzpeter authored and akpm00 committed Apr 26, 2024
1 parent 4418c52 commit a12083d
Showing 1 changed file with 163 additions and 106 deletions.
269 changes: 163 additions & 106 deletions mm/gup.c
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,149 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
}

#ifdef CONFIG_MMU

#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP)
/*
 * Fill @pages with the subpages of a huge mapping that cover [@addr, @end).
 *
 * @page:  page the huge mapping starts at
 * @sz:    size of the huge mapping in bytes; the low bits of @addr
 *         (addr & (sz - 1)) select the first subpage, so this is
 *         presumably a power of two
 * @addr:  first address to record
 * @end:   address to stop at; must be reachable from @addr in PAGE_SIZE
 *         steps because the loop tests for inequality
 * @pages: output array, one entry is written per PAGE_SIZE step
 *
 * Returns the number of entries stored in @pages.
 */
static int record_subpages(struct page *page, unsigned long sz,
			   unsigned long addr, unsigned long end,
			   struct page **pages)
{
	/* Index of the subpage that @addr falls into within the mapping */
	unsigned long first_idx = (addr & (sz - 1)) >> PAGE_SHIFT;
	struct page *first = nth_page(page, first_idx);
	int count = 0;

	while (addr != end) {
		pages[count] = nth_page(first, count);
		count++;
		addr += PAGE_SIZE;
	}

	return count;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */

#ifdef CONFIG_ARCH_HAS_HUGEPD
/*
 * Return the address at which the current huge PTE stops covering the
 * range [@addr, @end): either the next @sz-aligned boundary above @addr
 * or @end, whichever comes first.
 *
 * Both sides of the comparison are decremented so that a value of 0
 * (e.g. a boundary that wrapped around) compares as the largest address
 * instead of the smallest.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	if (boundary - 1 < end - 1)
		return boundary;

	return end;
}

/*
 * Take references on the subpages mapped by a single huge PTE.
 *
 * The range [@addr, @end) is first clamped to the @sz-sized huge page that
 * contains @addr.  The subpages are recorded into @pages starting at index
 * *@nr, the folio is grabbed, and the PTE is then re-read to make sure it
 * did not change in the meantime.
 *
 * Returns 1 on success, with *@nr advanced by the number of recorded pages.
 * Returns 0 if access is not permitted, the folio cannot be grabbed, the
 * PTE changed, or the page must be unshared first; in that case *@nr is
 * left untouched and any reference taken here has been dropped.
 *
 * NOTE(review): follow_hugepd() reaches this helper through gup_huge_pd()
 * with the huge PTE lock held.  Confirm that try_grab_folio() is suitable
 * for that (non fast-GUP) calling context.
 */
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		       unsigned long end, unsigned int flags,
		       struct page **pages, int *nr)
{
	unsigned long limit = (addr + sz) & ~(sz - 1);
	struct folio *folio;
	struct page *pg;
	pte_t entry;
	int count;

	/* Never go past the end of the huge page that contains @addr */
	if (limit < end)
		end = limit;

	entry = huge_ptep_get(ptep);
	if (!pte_access_permitted(entry, flags & FOLL_WRITE))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(entry)));

	pg = pte_page(entry);
	count = record_subpages(pg, sz, addr, end, pages + *nr);

	folio = try_grab_folio(pg, count, flags);
	if (!folio)
		return 0;

	/* The entry changed since it was sampled: give the references back */
	if (unlikely(pte_val(entry) != pte_val(ptep_get(ptep))))
		goto put_folio;

	if (!pte_write(entry) && gup_must_unshare(NULL, flags, &folio->page))
		goto put_folio;

	*nr += count;
	folio_set_referenced(folio);
	return 1;

put_folio:
	gup_put_folio(folio, count, flags);
	return 0;
}

/*
 * Walk the huge PTEs of a hugepd that cover [@addr, @end) and grab the
 * pages behind them via gup_hugepte().
 *
 * Returns 1 if every huge PTE in the range was handled, 0 as soon as one
 * of them fails.  At least one huge PTE is always visited, even when
 * @addr equals @end on entry.
 *
 * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
 * systems on Power, which does not have issue with folio writeback against
 * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
 * even anonymous memory, we need to do extra check as what we do with most
 * of the other folios.  See writable_file_mapping_allowed() and
 * gup_fast_folio_allowed() for more information.
 */
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
		       unsigned int pdshift, unsigned long end,
		       unsigned int flags, struct page **pages, int *nr)
{
	unsigned long hpage_sz = 1UL << hugepd_shift(hugepd);
	pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);

	for (;;) {
		unsigned long next = hugepte_addr_end(addr, end, hpage_sz);

		/* gup_hugepte() clamps @end to the current huge page itself */
		if (!gup_hugepte(ptep, hpage_sz, addr, end, flags, pages, nr))
			return 0;

		ptep++;
		addr = next;
		if (addr == end)
			break;
	}

	return 1;
}

/*
 * Slow-GUP lookup of the single page at @addr through a hugepd entry.
 *
 * The huge PTE lock is held around gup_huge_pd(), which is asked for
 * exactly one base page ([@addr, @addr + PAGE_SIZE)).
 *
 * Returns the page on success and sets ctx->page_mask to
 * (1 << huge_page_order(h)) - 1 for the VMA's hstate.  Returns NULL when
 * the lookup fails, and ERR_PTR(-EFAULT) (with a one-time warning) when
 * @vma is not a hugetlb VMA.
 */
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
				  unsigned long addr, unsigned int pdshift,
				  unsigned int flags,
				  struct follow_page_context *ctx)
{
	struct page *found;
	struct hstate *hs;
	spinlock_t *lock;
	pte_t *ptep;
	int grabbed = 0;
	int ok;

	/* Only hugetlb supports hugepd */
	if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
		return ERR_PTR(-EFAULT);

	hs = hstate_vma(vma);
	ptep = hugepte_offset(hugepd, addr, pdshift);

	lock = huge_pte_lock(hs, vma->vm_mm, ptep);
	ok = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE,
			 flags, &found, &grabbed);
	spin_unlock(lock);

	if (!ok)
		return NULL;

	/* Exactly one base page was requested, so exactly one is expected */
	WARN_ON_ONCE(grabbed != 1);
	ctx->page_mask = (1U << huge_page_order(hs)) - 1;
	return found;
}
#else /* CONFIG_ARCH_HAS_HUGEPD */
/*
 * Stub for builds without CONFIG_ARCH_HAS_HUGEPD: always returns 0 and
 * leaves @pages and @nr untouched.
 */
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
}

/*
 * Stub for builds without CONFIG_ARCH_HAS_HUGEPD: always returns NULL and
 * does not touch @ctx.
 */
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned int flags,
struct follow_page_context *ctx)
{
return NULL;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */


static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags, unsigned long address)
{
Expand Down Expand Up @@ -868,6 +1011,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
address, PMD_SHIFT, flags, ctx);
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
Expand Down Expand Up @@ -918,6 +1064,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = READ_ONCE(*pudp);
if (!pud_present(pud))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
return follow_hugepd(vma, __hugepd(pud_val(pud)),
address, PUD_SHIFT, flags, ctx);
if (pud_leaf(pud)) {
ptl = pud_lock(mm, pudp);
page = follow_huge_pud(vma, address, pudp, flags, ctx);
Expand All @@ -941,10 +1090,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,

p4dp = p4d_offset(pgdp, address);
p4d = READ_ONCE(*p4dp);
if (!p4d_present(p4d))
return no_page_table(vma, flags, address);
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(p4d_bad(p4d)))

if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
address, P4D_SHIFT, flags, ctx);

if (!p4d_present(p4d) || p4d_bad(p4d))
return no_page_table(vma, flags, address);

return follow_pud_mask(vma, address, p4dp, flags, ctx);
Expand Down Expand Up @@ -994,10 +1146,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,

pgd = pgd_offset(mm, address);

if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
address, PGDIR_SHIFT, flags, ctx);
else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
page = no_page_table(vma, flags, address);
else
page = follow_p4d_mask(vma, address, pgd, flags, ctx);

return follow_p4d_mask(vma, address, pgd, flags, ctx);
return page;
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
Expand Down Expand Up @@ -2954,106 +3111,6 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif

/*
 * Fill @pages with the subpages of a huge mapping that cover [@addr, @end).
 *
 * @page:  page the huge mapping starts at
 * @sz:    size of the huge mapping in bytes; the low bits of @addr
 *         (addr & (sz - 1)) select the first subpage, so this is
 *         presumably a power of two
 * @addr:  first address to record
 * @end:   address to stop at; must be reachable from @addr in PAGE_SIZE
 *         steps because the loop tests for inequality
 * @pages: output array, one entry is written per PAGE_SIZE step
 *
 * Returns the number of entries stored in @pages.
 */
static int record_subpages(struct page *page, unsigned long sz,
			   unsigned long addr, unsigned long end,
			   struct page **pages)
{
	/* Index of the subpage that @addr falls into within the mapping */
	unsigned long first_idx = (addr & (sz - 1)) >> PAGE_SHIFT;
	struct page *first = nth_page(page, first_idx);
	int count = 0;

	while (addr != end) {
		pages[count] = nth_page(first, count);
		count++;
		addr += PAGE_SIZE;
	}

	return count;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
/*
 * Return the address at which the current huge PTE stops covering the
 * range [@addr, @end): either the next @sz-aligned boundary above @addr
 * or @end, whichever comes first.
 *
 * Both sides of the comparison are decremented so that a value of 0
 * (e.g. a boundary that wrapped around) compares as the largest address
 * instead of the smallest.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	if (boundary - 1 < end - 1)
		return boundary;

	return end;
}

/*
 * Take references on the subpages mapped by a single huge PTE.
 *
 * The range [@addr, @end) is first clamped to the @sz-sized huge page that
 * contains @addr.  The subpages are recorded into @pages starting at index
 * *@nr, the folio is grabbed, and the PTE is then re-read to make sure it
 * did not change in the meantime.
 *
 * Returns 1 on success, with *@nr advanced by the number of recorded pages.
 * Returns 0 if access is not permitted, the folio cannot be grabbed, the
 * PTE changed, or the page must be unshared first; in that case *@nr is
 * left untouched and any reference taken here has been dropped.
 */
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		       unsigned long end, unsigned int flags,
		       struct page **pages, int *nr)
{
	unsigned long limit = (addr + sz) & ~(sz - 1);
	struct folio *folio;
	struct page *pg;
	pte_t entry;
	int count;

	/* Never go past the end of the huge page that contains @addr */
	if (limit < end)
		end = limit;

	entry = huge_ptep_get(ptep);
	if (!pte_access_permitted(entry, flags & FOLL_WRITE))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(entry)));

	pg = pte_page(entry);
	count = record_subpages(pg, sz, addr, end, pages + *nr);

	folio = try_grab_folio(pg, count, flags);
	if (!folio)
		return 0;

	/* The entry changed since it was sampled: give the references back */
	if (unlikely(pte_val(entry) != pte_val(ptep_get(ptep))))
		goto put_folio;

	if (!pte_write(entry) && gup_must_unshare(NULL, flags, &folio->page))
		goto put_folio;

	*nr += count;
	folio_set_referenced(folio);
	return 1;

put_folio:
	gup_put_folio(folio, count, flags);
	return 0;
}

/*
 * Walk the huge PTEs of a hugepd that cover [@addr, @end) and grab the
 * pages behind them via gup_hugepte().
 *
 * Returns 1 if every huge PTE in the range was handled, 0 as soon as one
 * of them fails.  At least one huge PTE is always visited, even when
 * @addr equals @end on entry.
 *
 * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
 * systems on Power, which does not have issue with folio writeback against
 * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
 * even anonymous memory, we need to do extra check as what we do with most
 * of the other folios.  See writable_file_mapping_allowed() and
 * gup_fast_folio_allowed() for more information.
 */
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
		       unsigned int pdshift, unsigned long end,
		       unsigned int flags, struct page **pages, int *nr)
{
	unsigned long hpage_sz = 1UL << hugepd_shift(hugepd);
	pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);

	for (;;) {
		unsigned long next = hugepte_addr_end(addr, end, hpage_sz);

		/* gup_hugepte() clamps @end to the current huge page itself */
		if (!gup_hugepte(ptep, hpage_sz, addr, end, flags, pages, nr))
			return 0;

		ptep++;
		addr = next;
		if (addr == end)
			break;
	}

	return 1;
}
#else
/*
 * Stub for builds without CONFIG_ARCH_HAS_HUGEPD: always returns 0 and
 * leaves @pages and @nr untouched.
 */
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */

static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
Expand Down

0 comments on commit a12083d

Please sign in to comment.