Skip to content

Commit

Permalink
mm anon rmap: replace same_anon_vma linked list with an interval tree.
Browse files Browse the repository at this point in the history
When a large VMA (anon or private file mapping) is first touched, which
will populate its anon_vma field, and then split into many regions through
the use of mprotect(), the original anon_vma ends up linking all of the
vmas on a linked list.  This can cause rmap to become inefficient, as we
have to walk potentially thousands of irrelevent vmas before finding the
one a given anon page might fall into.

By replacing the same_anon_vma linked list with an interval tree (where
each avc's interval is determined by its vma's start and last pgoffs), we
can make rmap efficient for this use case again.

While the change is large, all of its pieces are fairly simple.

Most places that were walking the same_anon_vma list were looking for a
known pgoff, so they can just use the anon_vma_interval_tree_foreach()
interval tree iterator instead.  The exception here is ksm, where the
page's index is not known.  It would probably be possible to rework ksm so
that the index would be known, but for now I have decided to keep things
simple and just walk the entirety of the interval tree there.

When updating vma's that already have an anon_vma assigned, we must take
care to re-index the corresponding avc's on their interval tree.  This is
done through the use of anon_vma_interval_tree_pre_update_vma() and
anon_vma_interval_tree_post_update_vma(), which remove the avc's from
their interval tree before the update and re-insert them after the update.
 The anon_vma stays locked during the update, so there is no chance that
rmap would miss the vmas that are being updated.

Signed-off-by: Michel Lespinasse <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Daniel Santos <[email protected]>
Cc: Hugh Dickins <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
walken-google authored and torvalds committed Oct 9, 2012
1 parent 108d664 commit bf181b9
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 41 deletions.
14 changes: 14 additions & 0 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
Expand Down Expand Up @@ -1377,6 +1378,19 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
list_add_tail(&vma->shared.nonlinear, list);
}

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
struct rb_root *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
struct rb_root *root);
struct anon_vma_chain *anon_vma_interval_tree_iter_first(
struct rb_root *root, unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
struct anon_vma_chain *node, unsigned long start, unsigned long last);

#define anon_vma_interval_tree_foreach(avc, root, start, last) \
for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
Expand Down
11 changes: 6 additions & 5 deletions include/linux/rmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ struct anon_vma {
atomic_t refcount;

/*
* NOTE: the LSB of the head.next is set by
* NOTE: the LSB of the rb_root.rb_node is set by
* mm_take_all_locks() _after_ taking the above lock. So the
* head must only be read/written after taking the above lock
* rb_root must only be read/written after taking the above lock
* to be sure to see a valid next pointer. The LSB bit itself
* is serialized by a system wide lock only visible to
* mm_take_all_locks() (mm_all_locks_mutex).
*/
struct list_head head; /* Chain of private "related" vmas */
struct rb_root rb_root; /* Interval tree of private "related" vmas */
};

/*
Expand All @@ -57,14 +57,15 @@ struct anon_vma {
* with a VMA, or the VMAs associated with an anon_vma.
* The "same_vma" list contains the anon_vma_chains linking
* all the anon_vmas associated with this VMA.
* The "same_anon_vma" list contains the anon_vma_chains
* The "rb" field indexes on an interval tree the anon_vma_chains
* which link all the VMAs associated with this anon_vma.
*/
struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
struct list_head same_anon_vma; /* locked by anon_vma->mutex */
struct rb_node rb; /* locked by anon_vma->mutex */
unsigned long rb_subtree_last;
};

#ifdef CONFIG_MMU
Expand Down
5 changes: 3 additions & 2 deletions mm/huge_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
int mapcount, mapcount2;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;

BUG_ON(!PageHead(page));
BUG_ON(PageTail(page));

mapcount = 0;
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
Expand All @@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page,
__split_huge_page_refcount(page);

mapcount2 = 0;
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
Expand Down
14 changes: 14 additions & 0 deletions mm/interval_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
Expand Down Expand Up @@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
rb_insert_augmented(&node->shared.linear.rb, root,
&vma_interval_tree_augment);
}

static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
return vma_start_pgoff(avc->vma);
}

static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
return vma_last_pgoff(avc->vma);
}

INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree)
9 changes: 6 additions & 3 deletions mm/ksm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,8 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
struct vm_area_struct *vma;

anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
Expand Down Expand Up @@ -1671,7 +1672,8 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
struct vm_area_struct *vma;

anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
Expand Down Expand Up @@ -1723,7 +1725,8 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *vma;

anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
Expand Down
5 changes: 4 additions & 1 deletion mm/memory-failure.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
pgoff_t pgoff;

av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
return;

pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;

if (!task_early_kill(tsk))
continue;
list_for_each_entry(vmac, &av->head, same_anon_vma) {
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
Expand Down
73 changes: 55 additions & 18 deletions mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm)
#define validate_mm(mm) do { } while (0)
#endif

/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
*
* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
* vma must be removed from the anon_vma's interval trees using
* anon_vma_interval_tree_pre_update_vma().
*
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
* The entire update must be protected by exclusive mmap_sem and by
* the root anon_vma's mutex.
*/
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;

list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;

list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

static int find_vma_links(struct mm_struct *mm, unsigned long addr,
unsigned long end, struct vm_area_struct **pprev,
struct rb_node ***rb_link, struct rb_node **rb_parent)
Expand Down Expand Up @@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end);

vma_adjust_trans_huge(vma, start, end, adjust_next);

/*
* When changing only vma->vm_end, we don't really need anon_vma
* lock. This is a fairly rare case by itself, but the anon_vma
* lock may be shared between many sibling processes. Skipping
* the lock for brk adjustments makes a difference sometimes.
*/
if (vma->anon_vma && (importer || start != vma->vm_start)) {
anon_vma = vma->anon_vma;
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
} else if (adjust_next && next->anon_vma)
anon_vma = next->anon_vma;
if (anon_vma)
anon_vma_lock(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
}

if (root) {
flush_dcache_mmap_lock(mapping);
Expand Down Expand Up @@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}

if (anon_vma)
if (anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock(anon_vma);
}
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);

Expand Down Expand Up @@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
Expand Down Expand Up @@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma,
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
Expand Down Expand Up @@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
Expand All @@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* anon_vma->root->mutex.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->head.next))
&anon_vma->root->rb_root.rb_node))
BUG();
}
}
Expand Down Expand Up @@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* A single task can't take more than one mm_take_all_locks() in a row
* or it would deadlock.
*
* The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
* The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
Expand Down Expand Up @@ -2619,21 +2656,21 @@ int mm_take_all_locks(struct mm_struct *mm)

static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
*
* We must however clear the bitflag before unlocking
* the vma so the users using the anon_vma->head will
* the vma so the users using the anon_vma->rb_root will
* never see our bitflag.
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
* anon_vma->root->mutex.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->head.next))
&anon_vma->root->rb_root.rb_node))
BUG();
anon_vma_unlock(anon_vma);
}
Expand Down
Loading

0 comments on commit bf181b9

Please sign in to comment.