mm: vmscan: detect file thrashing at the reclaim root
We use refault information to determine whether the cache workingset is
stable or transitioning, and dynamically adjust the inactive:active file
LRU ratio so as to maximize protection from one-off cache during stable
periods, and minimize IO during transitions.
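
A minimal sketch of that balancing decision, as a self-contained userspace model (the names and the exact ratio are illustrative, not the kernel's formula): active list protection is dropped as soon as refaults have been observed since the last snapshot, and otherwise scales with the size of the file cache:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4K pages for this sketch */

/*
 * Toy model of the inactive:active file balancing.  "refaults" stands in
 * for the WORKINGSET_ACTIVATE counter, "refaults_snapshot" for the value
 * remembered at the end of the previous reclaim cycle.
 */
static int inactive_file_is_low(unsigned long inactive, unsigned long active,
                                unsigned long refaults,
                                unsigned long refaults_snapshot)
{
        unsigned long inactive_ratio;
        unsigned long gb;

        if (refaults != refaults_snapshot) {
                /* Workingset is transitioning: drop active list protection. */
                inactive_ratio = 0;
        } else {
                /* Workingset is stable: protect it, scaled by memory size. */
                gb = (inactive + active) >> (30 - PAGE_SHIFT);
                inactive_ratio = gb ? 3 : 1;    /* illustrative values */
        }

        return inactive * inactive_ratio < active;
}

int main(void)
{
        /* ~2GB of file cache; only the refault activity differs. */
        printf("stable:        shrink active list? %d\n",
               inactive_file_is_low(200000, 300000, 50, 50));
        printf("transitioning: shrink active list? %d\n",
               inactive_file_is_low(200000, 300000, 51, 50));
        return 0;
}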

With cgroups and their nested LRU lists, we currently don't do this
correctly.  While recursive cgroup reclaim establishes a relative LRU
order among the pages of all involved cgroups, refaults only affect the
local LRU order in the cgroup in which they are occurring.  As a result,
cache transitions can take longer in a cgrouped system as the active pages
of sibling cgroups aren't challenged when they should be.

[ Right now, this is somewhat theoretical, because the siblings, under
  continued regular reclaim pressure, should eventually run out of
  inactive pages - and since inactive:active *size* balancing is also
  done on a cgroup-local level, we will challenge the active pages
  eventually in most cases. But the next patch will move that relative
  size enforcement to the reclaim root as well, and then this patch
  here will be necessary to propagate refault pressure to siblings. ]

This patch moves refault detection to the root of reclaim.  Instead of
remembering the cgroup owner of an evicted page, remember the cgroup that
caused the reclaim to happen.  When refaults later occur, they'll
correctly influence the cross-cgroup LRU order that reclaim follows.

I.e.  if global reclaim kicked out pages in some subgroup A/B/C, the
refault of those pages will challenge the global LRU order, and not just
the local order down inside C.
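
The same idea as a compact userspace model (illustrative only: struct cgroup, note_eviction() and refault_distance() are stand-ins for the memcg/lruvec machinery, not kernel APIs). Eviction ages the owning cgroup and all of its ancestors, the shadow entry records the inactive age of the reclaim root, and the refault distance is later measured against that same root, so reclaim activity in sibling cgroups shows up in the distance:

#include <stdio.h>

/* Stand-in for a memcg/lruvec pair: a parent link plus the per-node
 * inactive_age counter that orders evictions. */
struct cgroup {
        struct cgroup *parent;
        unsigned long inactive_age;
};

/* Aging one cgroup's inactive list also ages the "virtual" inactive
 * lists of all its ancestors (cf. advance_inactive_age() below). */
static void advance_inactive_age(struct cgroup *cg)
{
        for (; cg; cg = cg->parent)
                cg->inactive_age++;
}

/* Eviction: age the owning cgroup's hierarchy, but let the shadow
 * entry remember the age of the cgroup that caused the reclaim. */
static unsigned long note_eviction(struct cgroup *owner, struct cgroup *reclaim_root)
{
        advance_inactive_age(owner);
        return reclaim_root->inactive_age;
}

/* Refault: measure the distance against that same reclaim root. */
static unsigned long refault_distance(struct cgroup *reclaim_root,
                                      unsigned long eviction)
{
        return reclaim_root->inactive_age - eviction;
}

int main(void)
{
        struct cgroup root = { .parent = NULL };
        struct cgroup A    = { .parent = &root };
        struct cgroup B    = { .parent = &A    };
        struct cgroup C    = { .parent = &B    };      /* A/B/C */
        struct cgroup D    = { .parent = &B    };      /* A/B/D, a sibling of C */

        /* Global reclaim (rooted at "root") evicts a page owned by A/B/C. */
        unsigned long shadow = note_eviction(&C, &root);

        /* Reclaim keeps aging the sibling A/B/D as well. */
        advance_inactive_age(&D);
        advance_inactive_age(&D);

        /* Sibling activity is visible in the distance at the root. */
        printf("refault distance at reclaim root: %lu\n",
               refault_distance(&root, shadow));
        return 0;
}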

[[email protected]:  use page_memcg() instead of another lookup]
  Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Johannes Weiner <[email protected]>
Reviewed-by: Suren Baghdasaryan <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Shakeel Butt <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
hnaz authored and torvalds committed Dec 1, 2019
1 parent 53138ce commit b910718
Showing 4 changed files with 73 additions and 33 deletions.
5 changes: 5 additions & 0 deletions include/linux/memcontrol.h
@@ -901,6 +901,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
return &pgdat->__lruvec;
}

static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
}

static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
{
2 changes: 1 addition & 1 deletion include/linux/swap.h
@@ -307,7 +307,7 @@ struct vma_swap_readahead {
};

/* linux/mm/workingset.c */
void *workingset_eviction(struct page *page);
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);

32 changes: 15 additions & 17 deletions mm/vmscan.c
@@ -853,7 +853,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
* gets returned with a refcount of 0.
*/
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
bool reclaimed, struct mem_cgroup *target_memcg)
{
unsigned long flags;
int refcount;
@@ -925,7 +925,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page);
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);

@@ -948,7 +948,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
if (__remove_mapping(mapping, page, false)) {
if (__remove_mapping(mapping, page, false, NULL)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
@@ -1426,7 +1426,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,

count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
} else if (!mapping || !__remove_mapping(mapping, page, true,
sc->target_mem_cgroup))
goto keep_locked;

unlock_page(page);
@@ -2189,6 +2190,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
enum lru_list inactive_lru = file * LRU_FILE;
unsigned long inactive, active;
unsigned long inactive_ratio;
struct lruvec *target_lruvec;
unsigned long refaults;
unsigned long gb;

@@ -2200,8 +2202,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
if (file && lruvec->refaults != refaults) {
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
if (file && target_lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2973,19 +2976,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask = orig_mask;
}

static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
{
struct mem_cgroup *memcg;

memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
do {
unsigned long refaults;
struct lruvec *lruvec;
struct lruvec *target_lruvec;
unsigned long refaults;

lruvec = mem_cgroup_lruvec(memcg, pgdat);
refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
target_lruvec->refaults = refaults;
}

/*
67 changes: 52 additions & 15 deletions mm/workingset.c
@@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}

static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
{
/*
* Reclaiming a cgroup means reclaiming all its children in a
* round-robin fashion. That means that each cgroup has an LRU
* order that is composed of the LRU orders of its child
* cgroups; and every page has an LRU position not just in the
* cgroup that owns it, but in all of that group's ancestors.
*
* So when the physical inactive list of a leaf cgroup ages,
* the virtual inactive lists of all its parents, including
* the root cgroup's, age as well.
*/
do {
struct lruvec *lruvec;

lruvec = mem_cgroup_lruvec(memcg, pgdat);
atomic_long_inc(&lruvec->inactive_age);
} while (memcg && (memcg = parent_mem_cgroup(memcg)));
}

/**
* workingset_eviction - note the eviction of a page from memory
* @target_memcg: the cgroup that is causing the reclaim
* @page: the page being evicted
*
* Returns a shadow entry to be stored in @page->mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
void *workingset_eviction(struct page *page)
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
{
struct pglist_data *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
int memcgid;

/* Page is fully exclusive and pins page->mem_cgroup */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);

lruvec = mem_cgroup_lruvec(memcg, pgdat);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
advance_inactive_age(page_memcg(page), pgdat);

lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->inactive_age);
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}

@@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page)
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
* evicted page in the context of the node and the memcg whose memory
* pressure caused the eviction.
*/
void workingset_refault(struct page *page, void *shadow)
{
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
struct pglist_data *pgdat;
unsigned long active_file;
@@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow)
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !memcg)
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
goto out;
lruvec = mem_cgroup_lruvec(memcg, pgdat);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->inactive_age);
active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);

/*
* Calculate the refault distance
@@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow)
*/
refault_distance = (refault - eviction) & EVICTION_MASK;

/*
* The activation decision for this page is made at the level
* where the eviction occurred, as that is where the LRU order
* during page reclaim is being determined.
*
* However, the cgroup that will own the page is the one that
* is actually experiencing the refault event.
*/
memcg = page_memcg(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);

inc_lruvec_state(lruvec, WORKINGSET_REFAULT);

/*
Expand All @@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow)
goto out;

SetPageActive(page);
atomic_long_inc(&lruvec->inactive_age);
advance_inactive_age(memcg, pgdat);
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);

/* Page was active prior to eviction */
@@ -332,7 +371,6 @@ void workingset_refault(struct page *page, void *shadow)
void workingset_activation(struct page *page)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;

rcu_read_lock();
/*
@@ -345,8 +383,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
atomic_long_inc(&lruvec->inactive_age);
advance_inactive_age(memcg, page_pgdat(page));
out:
rcu_read_unlock();
}
