numa: fix /proc/<pid>/numa_maps for THP
In gather_pte_stats(), a THP pmd is cast into a pte, which is wrong
because the layouts may differ depending on the architecture.  On s390
this leads to inaccurate numa_maps accounting in /proc because of
misguided pte_present() and pte_dirty() checks on the fake pte.
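
To make the layout hazard concrete, consider this small userspace
analogy (not kernel code: the types, helpers, and bit positions below
are invented for illustration).  Reading an entry's bits through
helpers written for a different layout misreports its state, which is
exactly what the *(pte_t *)pmd cast risked on s390:

#include <stdio.h>
#include <stdint.h>

/* Invented layouts: the "present" flag sits in different bits. */
typedef struct { uint64_t val; } pte_t;	/* present bit: 0  */
typedef struct { uint64_t val; } pmd_t;	/* present bit: 33 */

static int pte_present(pte_t pte) { return !!(pte.val & (1ULL << 0)); }
static int pmd_present(pmd_t pmd) { return !!(pmd.val & (1ULL << 33)); }

int main(void)
{
	pmd_t pmd = { .val = 1ULL << 33 };	/* a present huge-page entry */

	/* Buggy pattern, deliberately mirroring the patch's target:
	 * reinterpret the pmd as a pte, then ask pte-level helpers. */
	pte_t fake_pte = *(pte_t *)&pmd;
	printf("pte_present(fake pte) = %d  (wrong: entry IS present)\n",
	       pte_present(fake_pte));

	/* Fixed pattern: stay at the pmd level, as the patch does. */
	printf("pmd_present(pmd)      = %d  (correct)\n", pmd_present(pmd));
	return 0;
}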

On other architectures pte_present() and pte_dirty() may work by chance,
but there may be an issue with direct-access (dax) mappings without
underlying struct pages when HAVE_PTE_SPECIAL is set and THP is
available.  In vm_normal_page() the fake pte will be checked with
pte_special(), and because there is no "special" bit in a pmd, this will
always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be
skipped.  On dax mappings without struct pages, an invalid struct page
pointer would then be returned, which can crash the kernel.
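
The dax failure mode can be sketched as a simplified userspace model
(the _sim functions below are invented stand-ins, not the kernel's
actual control flow): because the punned entry can never report the
"special" bit, the guard that should return NULL for a page-less pfn is
bypassed and a garbage struct page pointer escapes to the caller:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct page { int nid; };

/* Invented stand-in: fabricates a pointer for a pfn that has no
 * struct page behind it, as happens for dax pfns. */
static struct page *pfn_to_page_sim(uint64_t pfn)
{
	return (struct page *)(uintptr_t)(pfn << 6);
}

/* Simplified model of the vm_normal_page() decision: a real pte
 * carries a "special" bit for such pfns; a pmd punned into a pte
 * never reports it. */
static struct page *vm_normal_page_sim(bool special_bit, bool pfnmap_vma,
				       uint64_t pfn)
{
	if (special_bit && pfnmap_vma)
		return NULL;		/* correct: no struct page exists */
	return pfn_to_page_sim(pfn);	/* bypassed guard: bogus pointer  */
}

int main(void)
{
	/* Fake pte from a pmd: the special bit is unreadable, so false. */
	struct page *p = vm_normal_page_sim(false, true, 0x1000);

	printf("guard bypassed, got %p instead of NULL\n", (void *)p);
	/* In the kernel, dereferencing p here is the crash. */
	return 0;
}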

This patch fixes the numa_maps THP handling by introducing new "_pmd"
variants of the can_gather_numa_stats() and vm_normal_page() functions.

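For context, here is a minimal way to observe the accounting that this
patch fixes; a sketch using standard Linux APIs, with the caveat that
MADV_HUGEPAGE is only a hint and actual THP use depends on the running
system.  After faulting in the region, /proc/self/numa_maps should show
the mapping's anon/dirty/N<node> page counts, the values that were
miscounted on s390:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ (4UL << 20)	/* 4 MiB: room for at least one 2 MiB THP */

int main(void)
{
	char line[4096];
	FILE *f;
	void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ, MADV_HUGEPAGE);	/* hint only; may be ignored */
	memset(p, 0xff, SZ);		/* fault in and dirty the range */

	f = fopen("/proc/self/numa_maps", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
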
Signed-off-by: Gerald Schaefer <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: "Kirill A . Shutemov" <[email protected]>
Cc: Konstantin Khlebnikov <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Jerome Marchand <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Michael Holzheu <[email protected]>
Cc: <[email protected]>	[4.3+]
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
gerald-schaefer authored and torvalds committed Apr 29, 2016
1 parent 3486b85 commit 28093f9
Showing 3 changed files with 72 additions and 3 deletions.
33 changes: 30 additions & 3 deletions fs/proc/task_mmu.c
@@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+					      struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+#endif
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1527,21 +1553,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte;
 	pte_t *pte;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
 	}
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
+#endif
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, vma, addr);
2 changes: 2 additions & 0 deletions include/linux/mm.h
@@ -1140,6 +1140,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
40 changes: 40 additions & 0 deletions mm/memory.c
@@ -789,6 +789,46 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	return pfn_to_page(pfn);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+#endif
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
