Skip to content

Commit

Permalink
mm: account pmd page tables to the process
Browse files Browse the repository at this point in the history
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup.  The trick is to allocate a lot of PMD page tables.  Linux
kernel doesn't account PMD tables to the process, only PTE.

The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low.  oom_score for the process will be 0.

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	#define PUD_SIZE (1UL << 30)
	#define PMD_SIZE (1UL << 21)

	#define NR_PUD 130000

	int main(void)
	{
		char *addr = NULL;
		unsigned long i;

		prctl(PR_SET_THP_DISABLE);
		for (i = 0; i < NR_PUD ; i++) {
			addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
			if (addr == MAP_FAILED) {
				perror("mmap");
				break;
			}
			*addr = 'x';
			munmap(addr, PMD_SIZE);
			mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
			if (addr == MAP_FAILED)
				perror("re-mmap"), exit(1);
		}
		printf("PID %d consumed %lu KiB in PMD page tables\n",
				getpid(), i * 4096 >> 10);
		return pause();
	}

The patch addresses the issue by account PMD tables to the process the
same way we account PTE.

The main place where PMD tables is accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:

 - HugeTLB can share PMD page tables. The patch handles by accounting
   the table to all processes who share it.

 - x86 PAE pre-allocates few PMD tables on fork.

 - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
   check on exit(2).

Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded).  As with nr_ptes we use per-mm counter.  The
counter value is used to calculate baseline for badness score by
oom-killer.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reported-by: Dave Hansen <[email protected]>
Cc: Hugh Dickins <[email protected]>
Reviewed-by: Cyrill Gorcunov <[email protected]>
Cc: Pavel Emelyanov <[email protected]>
Cc: David Rientjes <[email protected]>
Tested-by: Sedat Dilek <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
kiryl authored and torvalds committed Feb 12, 2015
1 parent 8aa7687 commit dc6c9a3
Show file tree
Hide file tree
Showing 11 changed files with 75 additions and 29 deletions.
12 changes: 6 additions & 6 deletions Documentation/sysctl/vm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -555,12 +555,12 @@ this is causing problems for your system/application.

oom_dump_tasks

Enables a system-wide task dump (excluding kernel threads) to be
produced when the kernel performs an OOM-killing and includes such
information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
oom_score_adj score, and name. This is helpful to determine why the
OOM killer was invoked, to identify the rogue task that caused it,
and to determine why the OOM killer chose the task it did to kill.
Enables a system-wide task dump (excluding kernel threads) to be produced
when the kernel performs an OOM-killing and includes such information as
pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
score, and name. This is helpful to determine why the OOM killer was
invoked, to identify the rogue task that caused it, and to determine why
the OOM killer chose the task it did to kill.

If this is set to zero, this information is suppressed. On very
large systems with thousands of tasks it may not be feasible to dump
Expand Down
14 changes: 9 additions & 5 deletions arch/x86/mm/pgtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,19 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)

#endif /* CONFIG_X86_PAE */

static void free_pmds(pmd_t *pmds[])
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
int i;

for(i = 0; i < PREALLOCATED_PMDS; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
mm_dec_nr_pmds(mm);
}
}

static int preallocate_pmds(pmd_t *pmds[])
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
int i;
bool failed = false;
Expand All @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
pmd = NULL;
failed = true;
}
if (pmd)
mm_inc_nr_pmds(mm);
pmds[i] = pmd;
}

if (failed) {
free_pmds(pmds);
free_pmds(mm, pmds);
return -ENOMEM;
}

Expand All @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)

paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
mm_dec_nr_pmds(mm);
}
}
}
Expand Down Expand Up @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)

mm->pgd = pgd;

if (preallocate_pmds(pmds) != 0)
if (preallocate_pmds(mm, pmds) != 0)
goto out_free_pgd;

if (paravirt_pgd_alloc(mm) != 0)
Expand All @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
return pgd;

out_free_pmds:
free_pmds(pmds);
free_pmds(mm, pmds);
out_free_pgd:
free_page((unsigned long)pgd);
out:
Expand Down
9 changes: 6 additions & 3 deletions fs/proc/task_mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long data, text, lib, swap;
unsigned long data, text, lib, swap, ptes, pmds;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

/*
Expand All @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
Expand All @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n"
"VmPMD:\t%8lu kB\n"
"VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
total_vm << (PAGE_SHIFT-10),
Expand All @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
(PTRS_PER_PTE * sizeof(pte_t) *
atomic_long_read(&mm->nr_ptes)) >> 10,
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
}

Expand Down
24 changes: 24 additions & 0 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
{
return 0;
}

static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
{
return 0;
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
{
return atomic_long_read(&mm->nr_pmds);
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
atomic_long_inc(&mm->nr_pmds);
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
atomic_long_dec(&mm->nr_pmds);
}
#endif

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
Expand Down
3 changes: 2 additions & 1 deletion include/linux/mm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,8 @@ struct mm_struct {
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
atomic_long_t nr_ptes; /* Page table pages */
atomic_long_t nr_ptes; /* PTE page table pages */
atomic_long_t nr_pmds; /* PMD page table pages */
int map_count; /* number of VMAs */

spinlock_t page_table_lock; /* Protects page tables and some counters */
Expand Down
3 changes: 3 additions & 0 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
#ifndef __PAGETABLE_PMD_FOLDED
atomic_long_set(&mm->nr_pmds, 0);
#endif
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
Expand Down
3 changes: 2 additions & 1 deletion mm/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
"get_unmapped_area %p\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
Expand Down Expand Up @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
Expand Down
8 changes: 6 additions & 2 deletions mm/hugetlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
mm_inc_nr_pmds(mm);
get_page(virt_to_page(spte));
break;
}
Expand All @@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)

ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
spin_lock(ptl);
if (pud_none(*pud))
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
else
} else {
put_page(virt_to_page(spte));
mm_inc_nr_pmds(mm);
}
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
Expand Down Expand Up @@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)

pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
Expand Down
15 changes: 9 additions & 6 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
Expand Down Expand Up @@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)

spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) /* Another has populated it */
pmd_free(mm, new);
else
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
#else
if (pgd_present(*pud)) /* Another has populated it */
} else /* Another has populated it */
pmd_free(mm, new);
else
#else
if (!pgd_present(*pud)) {
mm_inc_nr_pmds(mm);
pgd_populate(mm, pud, new);
} else /* Another has populated it */
pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
Expand Down
4 changes: 3 additions & 1 deletion mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm)
vm_unacct_memory(nr_accounted);

WARN_ON(atomic_long_read(&mm->nr_ptes) >
(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT);
WARN_ON(mm_nr_pmds(mm) >
round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);
}

/* Insert vm structure into process list sorted by address
Expand Down
9 changes: 5 additions & 4 deletions mm/oom_kill.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
get_mm_counter(p->mm, MM_SWAPENTS);
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
task_unlock(p);

/*
Expand Down Expand Up @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;

pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
Expand All @@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}

pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
atomic_long_read(&task->mm->nr_ptes),
mm_nr_pmds(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
Expand Down

0 comments on commit dc6c9a3

Please sign in to comment.