Skip to content

Commit

Permalink
mm/gup: fix gup_fast with dynamic page table folding
Browse files Browse the repository at this point in the history
Currently to make sure that every page table entry is read just once
gup_fast walks perform READ_ONCE and pass pXd value down to the next
gup_pXd_range function by value e.g.:

  static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  ...
          pudp = pud_offset(&p4d, addr);

This function passes a reference on that local value copy to pXd_offset,
and might get the very same pointer in return.  This happens when the
level is folded (on most arches), and that pointer should not be
iterated.

On s390 due to the fact that each task might have different 5,4 or
3-level address translation and hence different levels folded the logic
is more complex and non-iteratable pointer to a local copy leads to
severe problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-level paging, crossing a 2 GB pud boundary:

  // addr = 0x1007ffff000, end = 0x10080001000
  static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  {
        unsigned long next;
        pud_t *pudp;

        // pud_offset returns &p4d itself (a pointer to a value on stack)
        pudp = pud_offset(&p4d, addr);
        do {
                // on second iteratation reading "random" stack value
                pud_t pud = READ_ONCE(*pudp);

                // next = 0x10080000000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
                next = pud_addr_end(addr, end);
                ...
        } while (pudp++, addr = next, addr != end); // pudp++ iterating over stack

        return 1;
  }

This happens since s390 moved to common gup code with commit
d1874a0 ("s390/mm: make the pxd_offset functions more robust") and
commit 1a42010 ("s390/mm: convert to the generic
get_user_pages_fast code").

s390 tried to mimic static level folding by changing pXd_offset
primitives to always calculate top level page table offset in pgd_offset
and just return the value passed when pXd_offset has to act as folded.

What is crucial for gup_fast and what has been overlooked is that
PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
And the latter is not possible with dynamic folding.

To fix the issue in addition to pXd values pass original pXdp pointers
down to gup_pXd_range functions.  And introduce pXd_offset_lockless
helpers, which take an additional pXd entry value parameter.  This has
already been discussed in

  https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Fixes: 1a42010 ("s390/mm: convert to the generic get_user_pages_fast code")
Signed-off-by: Vasily Gorbik <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Reviewed-by: Gerald Schaefer <[email protected]>
Reviewed-by: Alexander Gordeev <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Reviewed-by: Mike Rapoport <[email protected]>
Reviewed-by: John Hubbard <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Russell King <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Jeff Dike <[email protected]>
Cc: Richard Weinberger <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Claudio Imbrenda <[email protected]>
Cc: <[email protected]>	[5.2+]
Link: https://lkml.kernel.org/r/patch.git-943f1e5dcff2.your-ad-here.call-01599856292-ext-8676@work.hours
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Vasily Gorbik authored and torvalds committed Sep 26, 2020
1 parent 8d3fe09 commit d3f7b1b
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 21 deletions.
42 changes: 30 additions & 12 deletions arch/s390/include/asm/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)

#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)

static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
{
if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
return (p4d_t *) pgd;
if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
return (p4d_t *) pgdp;
}
#define p4d_offset_lockless p4d_offset_lockless

static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
{
if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
return (pud_t *) p4d_deref(*p4d) + pud_index(address);
return (pud_t *) p4d;
return p4d_offset_lockless(pgdp, *pgdp, address);
}

static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
{
if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
return (pud_t *) p4d_deref(p4d) + pud_index(address);
return (pud_t *) p4dp;
}
#define pud_offset_lockless pud_offset_lockless

static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
{
return pud_offset_lockless(p4dp, *p4dp, address);
}
#define pud_offset pud_offset

static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
return (pmd_t *) pud_deref(pud) + pmd_index(address);
return (pmd_t *) pudp;
}
#define pmd_offset_lockless pmd_offset_lockless

static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
{
if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
return (pmd_t *) pud_deref(*pud) + pmd_index(address);
return (pmd_t *) pud;
return pmd_offset_lockless(pudp, *pudp, address);
}
#define pmd_offset pmd_offset

Expand Down
10 changes: 10 additions & 0 deletions include/linux/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -1427,6 +1427,16 @@ typedef unsigned int pgtbl_mod_mask;
#define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
* p?d_leaf() - true if this entry is a final mapping to a physical address.
* This differs from p?d_huge() by the fact that they are always available (if
Expand Down
18 changes: 9 additions & 9 deletions mm/gup.c
Original file line number Diff line number Diff line change
Expand Up @@ -2485,13 +2485,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;

pmdp = pmd_offset(&pud, addr);
pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);

Expand Down Expand Up @@ -2528,13 +2528,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;

pudp = pud_offset(&p4d, addr);
pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);

Expand All @@ -2549,20 +2549,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
PUD_SHIFT, next, flags, pages, nr))
return 0;
} else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);

return 1;
}

static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;

p4dp = p4d_offset(&pgd, addr);
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
p4d_t p4d = READ_ONCE(*p4dp);

Expand All @@ -2574,7 +2574,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
P4D_SHIFT, next, flags, pages, nr))
return 0;
} else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);

Expand Down Expand Up @@ -2602,7 +2602,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, flags, pages, nr))
return;
} else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
Expand Down

0 comments on commit d3f7b1b

Please sign in to comment.