mm: per-thread vma caching
This patch is a continuation of efforts to optimize find_vma(),
avoiding potentially expensive rbtree walks when locating a vma upon
page faults.  The original approach (https://lkml.org/lkml/2013/11/1/410),
where the largest vma was also cached, ended up being too specific and
random, so further comparison with other approaches was needed.  There
are two things to consider here: the cache hit rate and the latency of
find_vma().  Improving the hit rate does not necessarily translate into
finding the vma any faster, as the overhead of any fancy caching scheme
can be too high to be worthwhile.

We currently cache the last used vma for the whole address space,
which provides a nice optimization, cutting the total cycles spent in
find_vma() by as much as ~2.5x for workloads with good locality.  On
the other hand, this simple scheme is pretty much useless for
workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.

The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number.  The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed.  Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question.  Concretely,
the following results are seen on an 80-core, 8-socket x86-64 box:

1) System bootup: Most programs are single threaded, so the per-thread
   scheme improves the ~50% baseline hit rate simply by adding a few
   more slots to the cache.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 50.61%   | 19.90            |
| patched        | 73.45%   | 13.58            |
+----------------+----------+------------------+

2) Kernel build: This one is already pretty good with the current
   approach as we're dealing with good locality.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 75.28%   | 11.03            |
| patched        | 88.09%   | 9.31             |
+----------------+----------+------------------+

3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 70.66%   | 17.14            |
| patched        | 91.15%   | 12.57            |
+----------------+----------+------------------+

4) Ebizzy: There's a fair amount of variation from run to run, but this
   approach always shows nearly perfect hit rates, while the baseline's
   are just about non-existent.  The total cycles fluctuate anywhere
   from ~60 to ~116 billion for the baseline scheme, but this approach
   reduces that considerably.  For instance, with 80 threads:

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 1.06%    | 91.54            |
| patched        | 99.97%   | 14.18            |
+----------------+----------+------------------+

[[email protected]: fix nommu build, per Davidlohr]
[[email protected]: document vmacache_valid() logic]
[[email protected]: attempt to untangle header files]
[[email protected]: add vmacache_find() BUG_ON]
[[email protected]: add vmacache_valid_mm() (from Oleg)]
[[email protected]: coding-style fixes]
[[email protected]: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
Acked-by: Linus Torvalds <[email protected]>
Reviewed-by: Michel Lespinasse <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Tested-by: Hugh Dickins <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Davidlohr Bueso authored and torvalds committed Apr 7, 2014
1 parent d7c1755 commit 615d6e8
Showing 12 changed files with 231 additions and 44 deletions.
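Before the per-file diffs, here is a minimal, self-contained sketch of the scheme
the commit message describes: each thread keeps a handful of vma slots, validated
against a per-address-space sequence number and replaced according to a hash of
the faulting address's page number.  All names below are illustrative stand-ins
rather than the kernel's identifiers (those appear in the diffs that follow), and
this user-space analogue deliberately ignores locking and the seqnum-overflow case.

/*
 * Illustrative sketch only: a simplified, user-space analogue of the
 * per-thread vma caching scheme described in the commit message.
 * All names here are made up; the real identifiers appear in the
 * diffs below.
 */
#include <stddef.h>
#include <string.h>

#define CACHE_BITS   2
#define CACHE_SIZE   (1U << CACHE_BITS)            /* four slots per thread */
#define CACHE_MASK   (CACHE_SIZE - 1)
#define PAGE_SHIFT   12                            /* assuming 4 KiB pages  */
#define SLOT(addr)   (((addr) >> PAGE_SHIFT) & CACHE_MASK)

struct vma    { unsigned long start, end; };
struct space  { unsigned int seqnum; };            /* one per address space */
struct thread {                                    /* one per thread        */
	unsigned int seqnum;
	struct vma *slots[CACHE_SIZE];
};

/* Any mapping change invalidates every thread's cache in O(1) by bumping
 * the space-wide sequence number; stale threads notice the mismatch lazily
 * on their next lookup.  (A 32-bit overflow would require explicitly
 * flushing every thread, which is the rare expensive case.) */
void cache_invalidate(struct space *s)
{
	s->seqnum++;
}

struct vma *cache_find(struct thread *t, struct space *s, unsigned long addr)
{
	unsigned int i;

	if (t->seqnum != s->seqnum) {              /* stale: resync and miss */
		t->seqnum = s->seqnum;
		memset(t->slots, 0, sizeof(t->slots));
		return NULL;
	}
	for (i = 0; i < CACHE_SIZE; i++) {
		struct vma *v = t->slots[i];

		if (v && v->start <= addr && addr < v->end)
			return v;                  /* hit */
	}
	return NULL;                               /* miss: walk the rbtree */
}

/* After a miss, the rbtree result evicts the slot picked by hashing the
 * page number of the faulting address -- the replacement policy from the
 * commit message. */
void cache_update(struct thread *t, unsigned long addr, struct vma *v)
{
	t->slots[SLOT(addr)] = v;
}

The point of the sequence number is that invalidation stays O(1) no matter how
many threads share the address space; a stale thread pays for a single flush on
its next lookup instead of being chased down at invalidation time.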
4 changes: 3 additions & 1 deletion arch/unicore32/include/asm/mmu_context.h
@@ -14,6 +14,8 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/io.h>
 
 #include <asm/cacheflush.h>
@@ -73,7 +75,7 @@ do { \
 		else \
 			mm->mmap = NULL; \
 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
+		vmacache_invalidate(mm); \
 		mm->map_count--; \
 		remove_vma(high_vma); \
 	} \
5 changes: 4 additions & 1 deletion fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
 	task_unlock(tsk);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
3 changes: 2 additions & 1 deletion fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	/*
 	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
+	 * vmacache most of the time. We have zero last_addr at
 	 * the beginning and also after lseek. We will have -1 last_addr
 	 * after the end of the vmas.
 	 */
4 changes: 2 additions & 2 deletions include/linux/mm_types.h
@@ -342,9 +342,9 @@ struct mm_rss_stat {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
7 changes: 7 additions & 0 deletions include/linux/sched.h
@@ -132,6 +132,10 @@ struct perf_event_context;
 struct blk_plug;
 struct filename;
 
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -1235,6 +1239,9 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
 	unsigned brk_randomized:1;
 #endif
+	/* per-thread vma caching */
+	u32 vmacache_seqnum;
+	struct vm_area_struct *vmacache[VMACACHE_SIZE];
 #if defined(SPLIT_RSS_COUNTING)
 	struct task_rss_stat	rss_stat;
 #endif
38 changes: 38 additions & 0 deletions include/linux/vmacache.h
@@ -0,0 +1,38 @@
#ifndef __LINUX_VMACACHE_H
#define __LINUX_VMACACHE_H

#include <linux/sched.h>
#include <linux/mm.h>

/*
 * Hash based on the page number. Provides a good hit rate for
 * workloads with good locality and those with random accesses as well.
 */
#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)

static inline void vmacache_flush(struct task_struct *tsk)
{
	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
}

extern void vmacache_flush_all(struct mm_struct *mm);
extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
					    unsigned long addr);

#ifndef CONFIG_MMU
extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end);
#endif

static inline void vmacache_invalidate(struct mm_struct *mm)
{
	mm->vmacache_seqnum++;

	/* deal with overflows */
	if (unlikely(mm->vmacache_seqnum == 0))
		vmacache_flush_all(mm);
}

#endif /* __LINUX_VMACACHE_H */
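To make the replacement policy above concrete: with 4 KiB pages (PAGE_SHIFT == 12)
and the four slots defined in sched.h, VMACACHE_HASH simply cycles through the
slots page by page.  The stand-alone snippet below (illustrative only, constants
hard-coded) prints which slot a few sample addresses map to.

/* Illustration only: VMACACHE_HASH slot selection assuming 4 KiB pages
 * (PAGE_SHIFT == 12) and the four slots from sched.h (VMACACHE_MASK == 3). */
#include <stdio.h>

#define PAGE_SHIFT          12
#define VMACACHE_MASK       3
#define VMACACHE_HASH(addr) (((addr) >> PAGE_SHIFT) & VMACACHE_MASK)

int main(void)
{
	unsigned long addrs[] = {
		0x400000,	/* page 0x400 -> slot 0 */
		0x400fff,	/* same page  -> slot 0 */
		0x401000,	/* next page  -> slot 1 */
		0x402000,	/*            -> slot 2 */
		0x403000,	/*            -> slot 3 */
		0x404000,	/* wraps back -> slot 0 */
	};
	unsigned int i;

	for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
		printf("%#lx -> slot %lu\n", addrs[i], VMACACHE_HASH(addrs[i]));
	return 0;
}

Addresses within the same page always share a slot, so a tight loop over one
mapping keeps hitting the same entry, while accesses spread across several
mappings tend to occupy all four slots.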
14 changes: 11 additions & 3 deletions kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/rcupdate.h>
 
 #include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 	if (!CACHE_FLUSH_IS_SAFE)
 		return;
 
-	if (current->mm && current->mm->mmap_cache) {
-		flush_cache_range(current->mm->mmap_cache,
-				  addr, addr + BREAK_INSTR_SIZE);
+	if (current->mm) {
+		int i;
+
+		for (i = 0; i < VMACACHE_SIZE; i++) {
+			if (!current->vmacache[i])
+				continue;
+			flush_cache_range(current->vmacache[i],
+					  addr, addr + BREAK_INSTR_SIZE);
+		}
 	}
+
 	/* Force flush instruction cache if it was outside the mm */
 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
 }
7 changes: 6 additions & 1 deletion kernel/fork.c
@@ -28,6 +28,8 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
@@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
+	mm->vmacache_seqnum = 0;
 	mm->map_count = 0;
 	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldmm)
 		return 0;
 
+	/* initialize the new vmacache entries */
+	vmacache_flush(tsk);
+
 	if (clone_flags & CLONE_VM) {
 		atomic_inc(&oldmm->mm_users);
 		mm = oldmm;
2 changes: 1 addition & 1 deletion mm/Makefile
@@ -16,7 +16,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o \
+			   compaction.o balloon_compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o $(mmu-y)
 
 obj-y += init-mm.o
55 changes: 29 additions & 26 deletions mm/mmap.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
-	struct vm_area_struct *vma = NULL;
+	struct rb_node *rb_node;
+	struct vm_area_struct *vma;
 
 	/* Check the cache first. */
-	/* (Cache hit rate is typically around 35%.) */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-		struct rb_node *rb_node;
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
+		return vma;
 
-		rb_node = mm->mm_rb.rb_node;
-		vma = NULL;
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
 
-		while (rb_node) {
-			struct vm_area_struct *vma_tmp;
-
-			vma_tmp = rb_entry(rb_node,
-					   struct vm_area_struct, vm_rb);
-
-			if (vma_tmp->vm_end > addr) {
-				vma = vma_tmp;
-				if (vma_tmp->vm_start <= addr)
-					break;
-				rb_node = rb_node->rb_left;
-			} else
-				rb_node = rb_node->rb_right;
-		}
-	}
-	if (vma)
-		mm->mmap_cache = vma;
+	while (rb_node) {
+		struct vm_area_struct *tmp;
+
+		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+		if (tmp->vm_end > addr) {
+			vma = tmp;
+			if (tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
+	}
+
+	if (vma)
+		vmacache_update(addr, vma);
 	return vma;
 }

@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
24 changes: 16 additions & 8 deletions mm/nommu.c
@@ -15,6 +15,7 @@
 
 #include <linux/export.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/file.h>
@@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
  */
 static void delete_vma_from_mm(struct vm_area_struct *vma)
 {
+	int i;
 	struct address_space *mapping;
 	struct mm_struct *mm = vma->vm_mm;
+	struct task_struct *curr = current;
 
 	kenter("%p", vma);
 
 	protect_vma(vma, 0);
 
 	mm->map_count--;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = NULL;
+	for (i = 0; i < VMACACHE_SIZE; i++) {
+		/* if the vma is cached, invalidate the entire cache */
+		if (curr->vmacache[i] == vma) {
+			vmacache_invalidate(curr->mm);
+			break;
+		}
+	}
 
 	/* remove the VMA from the mapping */
 	if (vma->vm_file) {
@@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	/* check the cache first */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
 		return vma;
 
 	/* trawl the list (there may be multiple mappings in which addr
@@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 		if (vma->vm_start > addr)
 			return NULL;
 		if (vma->vm_end > addr) {
-			mm->mmap_cache = vma;
+			vmacache_update(addr, vma);
 			return vma;
 		}
 	}
@@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
 	unsigned long end = addr + len;
 
 	/* check the cache first */
-	vma = mm->mmap_cache;
-	if (vma && vma->vm_start == addr && vma->vm_end == end)
+	vma = vmacache_find_exact(mm, addr, end);
+	if (vma)
 		return vma;
 
 	/* trawl the list (there may be multiple mappings in which addr
@@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
 		if (vma->vm_start > addr)
 			return NULL;
 		if (vma->vm_end == end) {
-			mm->mmap_cache = vma;
+			vmacache_update(addr, vma);
 			return vma;
 		}
 	}
112 changes: 112 additions & 0 deletions mm/vmacache.c (diff not loaded on this page)
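The diff for the new mm/vmacache.c did not render above.  Based on the interfaces
declared in include/linux/vmacache.h and the fixup notes in the commit message
(vmacache_valid_mm() from Oleg, the vmacache_find() BUG_ON), the file plausibly
looks roughly like the condensed sketch below; the nommu-only vmacache_find_exact()
is omitted, and details may differ from the committed code.

/* Hedged approximation of mm/vmacache.c; see the actual commit for the
 * authoritative version. */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmacache.h>

/*
 * Flush the vma cache of every thread sharing this mm.  Only reached on
 * vmacache_seqnum overflow, so the cost is acceptable.
 */
void vmacache_flush_all(struct mm_struct *mm)
{
	struct task_struct *g, *p;

	rcu_read_lock();
	for_each_process_thread(g, p) {
		if (mm == p->mm)
			vmacache_flush(p);
	}
	rcu_read_unlock();
}

/*
 * The cache is task-local, so it is only meaningful when current is
 * operating on its own mm (e.g. not on a foreign mm via get_user_pages()).
 */
static bool vmacache_valid_mm(struct mm_struct *mm)
{
	return current->mm == mm;
}

void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
	if (vmacache_valid_mm(newvma->vm_mm))
		current->vmacache[VMACACHE_HASH(addr)] = newvma;
}

static bool vmacache_valid(struct mm_struct *mm)
{
	struct task_struct *curr = current;

	if (!vmacache_valid_mm(mm))
		return false;

	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
		/* First lookup after an invalidation: resync and start empty. */
		curr->vmacache_seqnum = mm->vmacache_seqnum;
		vmacache_flush(curr);
		return false;
	}
	return true;
}

struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
	int i;

	if (!vmacache_valid(mm))
		return NULL;

	for (i = 0; i < VMACACHE_SIZE; i++) {
		struct vm_area_struct *vma = current->vmacache[i];

		if (!vma)
			continue;
		BUG_ON(vma->vm_mm != mm);
		if (vma->vm_start <= addr && vma->vm_end > addr)
			return vma;
	}
	return NULL;
}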
