Skip to content

Commit

Permalink
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Browse files Browse the repository at this point in the history
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level.  Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly.  Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.

This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity.  In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask.  This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.

This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out.  All other (hugepage)
pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug.  We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping).  This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <[email protected]>
Signed-off-by: Benjamin Herrenschmidt <[email protected]>
  • Loading branch information
dgibson authored and ozbenh committed Oct 30, 2009
1 parent a0668cd commit a4fe3ce
Show file tree
Hide file tree
Showing 10 changed files with 313 additions and 410 deletions.
1 change: 0 additions & 1 deletion arch/powerpc/include/asm/hugetlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#include <asm/page.h>


int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
unsigned long len);

Expand Down
14 changes: 3 additions & 11 deletions arch/powerpc/include/asm/mmu-hash64.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,6 @@ extern unsigned long tce_alloc_start, tce_alloc_end;
*/
extern int mmu_ci_restrictions;

#ifdef CONFIG_HUGETLB_PAGE
/*
* The page size indexes of the huge pages for use by hugetlbfs
*/
extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];

#endif /* CONFIG_HUGETLB_PAGE */

/*
* This function sets the AVPN and L fields of the HPTE appropriately
* for the page size
Expand Down Expand Up @@ -254,9 +246,9 @@ extern int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned int local, int ssize);
struct mm_struct;
extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
unsigned long ea, unsigned long vsid, int local,
unsigned long trap);
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, int local, int ssize,
unsigned int shift, unsigned int mmu_psize);

extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
unsigned long pstart, unsigned long prot,
Expand Down
14 changes: 14 additions & 0 deletions arch/powerpc/include/asm/page.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,20 @@ typedef unsigned long pgprot_t;

#endif

typedef struct { signed long pd; } hugepd_t;
#define HUGEPD_SHIFT_MASK 0x3f

#ifdef CONFIG_HUGETLB_PAGE
static inline int hugepd_ok(hugepd_t hpd)
{
return (hpd.pd > 0);
}

#define is_hugepd(pdep) (hugepd_ok(*((hugepd_t *)(pdep))))
#else /* CONFIG_HUGETLB_PAGE */
#define is_hugepd(pdep) 0
#endif /* CONFIG_HUGETLB_PAGE */

struct page;
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
Expand Down
13 changes: 12 additions & 1 deletion arch/powerpc/include/asm/pgtable-ppc64.h
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,18 @@ void pgtable_cache_init(void);
return pt;
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
unsigned *shift);
#else
static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
unsigned *shift)
{
if (shift)
*shift = 0;
return find_linux_pte(pgdir, ea);
}
#endif /* !CONFIG_HUGETLB_PAGE */

#endif /* __ASSEMBLY__ */

Expand Down
3 changes: 3 additions & 0 deletions arch/powerpc/include/asm/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ extern void paging_init(void);
*/
extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);

extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr);

#endif /* __ASSEMBLY__ */

#endif /* __KERNEL__ */
Expand Down
20 changes: 5 additions & 15 deletions arch/powerpc/kernel/perf_callchain.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,6 @@ static void perf_callchain_kernel(struct pt_regs *regs,
}

#ifdef CONFIG_PPC64

#ifdef CONFIG_HUGETLB_PAGE
#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
#else
#define is_huge_psize(pagesize) 0
#endif

/*
* On 64-bit we don't want to invoke hash_page on user addresses from
* interrupt context, so if the access faults, we read the page tables
Expand All @@ -135,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
{
pgd_t *pgdir;
pte_t *ptep, pte;
int pagesize;
unsigned shift;
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
unsigned long pfn;
Expand All @@ -145,17 +138,14 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
if (!pgdir)
return -EFAULT;

pagesize = get_slice_psize(current->mm, addr);
ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
if (!shift)
shift = PAGE_SHIFT;

/* align address to page boundary */
offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
offset = addr & ((1UL << shift) - 1);
addr -= offset;

if (is_huge_psize(pagesize))
ptep = huge_pte_offset(current->mm, addr);
else
ptep = find_linux_pte(pgdir, addr);

if (ptep == NULL)
return -EFAULT;
pte = *ptep;
Expand Down
149 changes: 26 additions & 123 deletions arch/powerpc/mm/gup.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
unsigned long *addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long mask;
unsigned long pte_end;
struct page *head, *page;
pte_t pte;
int refs;

pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
if (pte_end < end)
end = pte_end;

pte = *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pte_val(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

refs = 0;
head = pte_page(pte);
page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (*addr += PAGE_SIZE, *addr != end);

if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
}
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
/* Could be optimized better */
while (*nr) {
put_page(page);
(*nr)--;
}
}

return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
Expand All @@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
return 0;
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
if (is_hugepd(pmdp)) {
if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
addr, next, write, pages, nr))
return 0;
} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);

Expand All @@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
if (is_hugepd(pudp)) {
if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
addr, next, write, pages, nr))
return 0;
} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);

Expand All @@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
unsigned long next;
pgd_t *pgdp;
int nr = 0;
#ifdef CONFIG_PPC64
unsigned int shift;
int psize;
#endif

pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");

Expand All @@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,

pr_devel(" aligned: %lx .. %lx\n", start, end);

#ifdef CONFIG_HUGETLB_PAGE
/* We bail out on slice boundary crossing when hugetlb is
* enabled in order to not have to deal with two different
* page table formats
*/
if (addr < SLICE_LOW_TOP) {
if (end > SLICE_LOW_TOP)
goto slow_irqon;

if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
GET_LOW_SLICE_INDEX(end - 1)))
goto slow_irqon;
} else {
if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
GET_HIGH_SLICE_INDEX(end - 1)))
goto slow_irqon;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
Expand All @@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
*/
local_irq_disable();

#ifdef CONFIG_PPC64
/* Those bits are related to hugetlbfs implementation and only exist
* on 64-bit for now
*/
psize = get_slice_psize(mm, addr);
shift = mmu_psize_defs[psize].shift;
#endif /* CONFIG_PPC64 */

#ifdef CONFIG_HUGETLB_PAGE
if (unlikely(mmu_huge_psizes[psize])) {
pte_t *ptep;
unsigned long a = addr;
unsigned long sz = ((1UL) << shift);
struct hstate *hstate = size_to_hstate(sz);

BUG_ON(!hstate);
/*
* XXX: could be optimized to avoid hstate
* lookup entirely (just use shift)
*/

do {
VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
ptep = huge_pte_offset(mm, a);
pr_devel(" %016lx: huge ptep %p\n", a, ptep);
if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
&nr))
goto slow;
} while (a != end);
} else
#endif /* CONFIG_HUGETLB_PAGE */
{
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;

#ifdef CONFIG_PPC64
VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
#endif
pr_devel(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;

pr_devel(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
if (is_hugepd(pgdp)) {
if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
}
} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);

local_irq_enable();

VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
Expand Down
26 changes: 14 additions & 12 deletions arch/powerpc/mm/hash_utils_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
unsigned long vsid;
struct mm_struct *mm;
pte_t *ptep;
unsigned hugeshift;
const struct cpumask *tmp;
int rc, user_region = 0, local = 0;
int psize, ssize;
Expand Down Expand Up @@ -943,30 +944,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
local = 1;

#ifdef CONFIG_HUGETLB_PAGE
/* Handle hugepage regions */
if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
DBG_LOW(" -> huge page !\n");
return hash_huge_page(mm, access, ea, vsid, local, trap);
}
#endif /* CONFIG_HUGETLB_PAGE */

#ifndef CONFIG_PPC_64K_PAGES
/* If we use 4K pages and our psize is not 4K, then we are hitting
* a special driver mapping, we need to align the address before
* we fetch the PTE
/* If we use 4K pages and our psize is not 4K, then we might
* be hitting a special driver mapping, and need to align the
* address before we fetch the PTE.
*
* It could also be a hugepage mapping, in which case this is
* not necessary, but it's not harmful, either.
*/
if (psize != MMU_PAGE_4K)
ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
#endif /* CONFIG_PPC_64K_PAGES */

/* Get PTE and page size from page tables */
ptep = find_linux_pte(pgdir, ea);
ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
if (hugeshift)
return __hash_page_huge(ea, access, vsid, ptep, trap, local,
ssize, hugeshift, psize);
#endif /* CONFIG_HUGETLB_PAGE */

#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
#else
Expand Down
Loading

0 comments on commit a4fe3ce

Please sign in to comment.