Skip to content

Commit

Permalink
memcg: switch lruvec stats to rstat
Browse files Browse the repository at this point in the history
Commit 2d146aa ("mm: memcontrol: switch to rstat") switched memcg
stats to the rstat infrastructure but skipped the conversion of the lruvec
stats, as such stats are read in performance-critical code paths and
flushing them could have hurt application performance.
This patch converts the lruvec stats to rstat, and later patches add
mechanisms to keep the performance impact to a minimum.

The rstat conversion comes at a price, namely memory cost.  Effectively,
this patch reverts the savings made by commit f3344ad ("mm:
memcontrol: optimize per-lruvec stats counter memory usage").  However,
this cost is justified by the negative impact of inaccurate lruvec
stats on many heuristics.  One such case is reported in [1].

The memory reclaim code is filled with a plethora of heuristics, and many of
those heuristics read the lruvec stats.  So, inaccurate stats can make
such heuristics ineffective.  [1] reports the impact of inaccurate lruvec
stats on the "cache trim mode" heuristic.  Inaccurate lruvec stats can
impact the deactivation and anon aging heuristics as well.

[1] https://lore.kernel.org/linux-mm/[email protected]/

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Shakeel Butt <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Michal Koutný <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
shakeelb authored and torvalds committed Sep 3, 2021
1 parent fab827d commit 7e1c0d6
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 98 deletions.
42 changes: 20 additions & 22 deletions include/linux/memcontrol.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};

/*
 * Per-lruvec counters: one full-width (long) slot for each node stat
 * item, NR_VM_NODE_STAT_ITEMS slots in total.
 */
struct lruvec_stat {
long count[NR_VM_NODE_STAT_ITEMS];
};

/*
 * Narrow (s32) variant of the per-lruvec counters: one slot for each
 * node stat item, NR_VM_NODE_STAT_ITEMS slots in total.
 */
struct batched_lruvec_stat {
s32 count[NR_VM_NODE_STAT_ITEMS];
};

/*
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
* shrinkers, which have elements charged to this memcg.
Expand All @@ -123,24 +115,30 @@ struct shrinker_info {
unsigned long *map;
};

/*
 * Per-CPU half of the lruvec statistics, one slot per node stat item.
 * @state holds the running per-CPU value; @state_prev holds the value
 * that has already been accounted for, so that only the difference
 * between the two needs to be propagated.
 */
struct lruvec_stats_percpu {
/* Local (CPU and cgroup) state */
long state[NR_VM_NODE_STAT_ITEMS];

/* Delta calculation for lockless upward propagation */
long state_prev[NR_VM_NODE_STAT_ITEMS];
};

/*
 * Aggregated half of the lruvec statistics, one slot per node stat item.
 * @state is the summed-up value; @state_pending collects deltas handed
 * up from children that have not yet been folded into @state.
 */
struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
long state[NR_VM_NODE_STAT_ITEMS];

/* Pending child counts during tree propagation */
long state_pending[NR_VM_NODE_STAT_ITEMS];
};

/*
* per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;

/*
* Legacy local VM stats. This must be struct lruvec_stat and
* cannot be narrowed to struct batched_lruvec_stat: the batched
* lruvec_stat_cpu counters are bounded by a threshold of at most
* MEMCG_CHARGE_BATCH * PAGE_SIZE, which fits into s32, but this
* field has no upper limit.
*/
struct lruvec_stat __percpu *lruvec_stat_local;

/* Subtree VM stats (batched updates) */
struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
struct lruvec_stats lruvec_stats;

unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

Expand Down Expand Up @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = atomic_long_read(&pn->lruvec_stat[idx]);
x = READ_ONCE(pn->lruvec_stats.state[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
Expand All @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
for_each_possible_cpu(cpu)
x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
Expand Down
114 changes: 38 additions & 76 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
return x;
}

/*
 * Look up the per-node info of the parent of @pn's memcg.
 *
 * @pn:  per-node info whose memcg's parent is wanted
 * @nid: node id used to index the parent's nodeinfo[] array
 *
 * Returns the parent's per-node info for @nid, or NULL when @pn's memcg
 * has no parent.
 */
static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
struct mem_cgroup *parent;

parent = parent_mem_cgroup(pn->memcg);
if (!parent)
return NULL;
return parent->nodeinfo[nid];
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
long x, threshold = MEMCG_CHARGE_BATCH;

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
Expand All @@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_state(memcg, idx, val);

/* Update lruvec */
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

if (vmstat_item_in_bytes(idx))
threshold <<= PAGE_SHIFT;

x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > threshold)) {
pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pi;

for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
}
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
}

/**
Expand Down Expand Up @@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}

/*
 * Fold @cpu's batched lruvec counters of @memcg into the shared atomic
 * counters.
 *
 * @memcg: memory cgroup whose per-CPU batches are drained
 * @cpu:   CPU whose batch is drained
 *
 * For every node, the per-CPU batch is copied out and reset to zero, and
 * the copied values are then added to the lruvec_stat[] counters of
 * @memcg and of each of its ancestors.
 */
static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
{
int nid;

for_each_node(nid) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
unsigned long stat[NR_VM_NODE_STAT_ITEMS];
struct batched_lruvec_stat *lstatc;
int i;

/* Snapshot this CPU's batch and clear it before propagating. */
lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
stat[i] = lstatc->count[i];
lstatc->count[i] = 0;
}

/* Apply the snapshot at this level, then walk up until there is no parent. */
do {
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
atomic_long_add(stat[i], &pn->lruvec_stat[i]);
} while ((pn = parent_nodeinfo(pn, nid)));
}
}

static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
struct mem_cgroup *memcg;

stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);

for_each_mem_cgroup(memcg)
memcg_flush_lruvec_page_state(memcg, cpu);

return 0;
}

Expand Down Expand Up @@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;

pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_local) {
kfree(pn);
return 1;
}

pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
Expand All @@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;

free_percpu(pn->lruvec_stat_cpu);
free_percpu(pn->lruvec_stat_local);
free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}

Expand All @@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)

static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int cpu;

memcg_wb_domain_exit(memcg);
/*
* Flush percpu lruvec stats to guarantee the value
* correctness on parent's and all ancestor levels.
*/
for_each_online_cpu(cpu)
memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}

Expand Down Expand Up @@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
long delta, v;
int i;
int i, nid;

statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);

Expand Down Expand Up @@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent)
parent->vmstats.events_pending[i] += delta;
}

for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
struct mem_cgroup_per_node *ppn = NULL;
struct lruvec_stats_percpu *lstatc;

if (parent)
ppn = parent->nodeinfo[nid];

lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);

for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
delta = pn->lruvec_stats.state_pending[i];
if (delta)
pn->lruvec_stats.state_pending[i] = 0;

v = READ_ONCE(lstatc->state[i]);
if (v != lstatc->state_prev[i]) {
delta += v - lstatc->state_prev[i];
lstatc->state_prev[i] = v;
}

if (!delta)
continue;

pn->lruvec_stats.state[i] += delta;
if (ppn)
ppn->lruvec_stats.state_pending[i] += delta;
}
}
}

#ifdef CONFIG_MMU
Expand Down Expand Up @@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

cgroup_rstat_flush(memcg->css.cgroup);

for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;

Expand Down

0 comments on commit 7e1c0d6

Please sign in to comment.