Skip to content

Commit

Permalink
cgroup: fix handling of multi-destination migration from subtree_cont…
Browse files Browse the repository at this point in the history
…rol enabling

Consider the following v2 hierarchy.

  P0 (+memory) --- P1 (-memory) --- A
                                 \- B
       
P0 has memory enabled in its subtree_control while P1 doesn't.  If
both A and B contain processes, they would belong to the memory css of
P1.  Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter.  IOW, enabling controllers
can cause atomic migrations into different csses.

The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses.  pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.

 WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
 Modules linked in:
 CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
 ...
  ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
  ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
  ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
 Call Trace:
  [<ffffffff81551ffc>] dump_stack+0x4e/0x82
  [<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
  [<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
  [<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
  [<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
  [<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
  [<ffffffff81189016>] cgroup_attach_task+0x176/0x200
  [<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
  [<ffffffff81189684>] cgroup_procs_write+0x14/0x20
  [<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
  [<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
  [<ffffffff81265f88>] __vfs_write+0x28/0xe0
  [<ffffffff812666fc>] vfs_write+0xac/0x1a0
  [<ffffffff81267019>] SyS_write+0x49/0xb0
  [<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76

This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach(), ->cancel_attach() and ->attach(), and
updating cgroup_taskset iteration helpers to also return the destination
css in addition to the task being migrated.  All controllers are
updated accordingly.

* Controllers which don't care whether there are one or multiple
  target csses can be converted trivially.  cpu, io, freezer, perf,
  netclassid and netprio fall in this category.

* cpuset's current implementation assumes that there's a single source
  and destination and thus doesn't support v2 hierarchy already.  The
  only change made by this patchset is how that single destination css
  is obtained.

* memory migration path already doesn't do anything on v2.  How the
  single destination css is obtained is updated and the prep stage of
  mem_cgroup_can_attach() is reordered to accommodate the change.

* pids is the only controller which was affected by this bug.  It now
  correctly handles multi-destination migrations and no longer causes
  counter underflow from incorrect accounting.

Signed-off-by: Tejun Heo <[email protected]>
Reported-and-tested-by: Daniel Wagner <[email protected]>
Cc: Aleksa Sarai <[email protected]>
  • Loading branch information
htejun committed Dec 3, 2015
1 parent 599c963 commit 1f7dd3e
Show file tree
Hide file tree
Showing 12 changed files with 137 additions and 92 deletions.
6 changes: 3 additions & 3 deletions block/blk-cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
static int blkcg_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *dst_css;
struct io_context *ioc;
int ret = 0;

/* task_lock() is needed to avoid races with exit_io_context() */
cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, dst_css, tset) {
task_lock(task);
ioc = task->io_context;
if (ioc && atomic_read(&ioc->nr_tasks) > 1)
Expand Down
9 changes: 3 additions & 6 deletions include/linux/cgroup-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -422,12 +422,9 @@ struct cgroup_subsys {
void (*css_reset)(struct cgroup_subsys_state *css);
void (*css_e_css_changed)(struct cgroup_subsys_state *css);

int (*can_attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
void (*attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset);
int (*can_fork)(struct task_struct *task, void **priv_p);
void (*cancel_fork)(struct task_struct *task, void *priv);
void (*fork)(struct task_struct *task, void *priv);
Expand Down
33 changes: 22 additions & 11 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it);
Expand Down Expand Up @@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it);
/**
* cgroup_taskset_for_each - iterate cgroup_taskset
* @task: the loop cursor
* @dst_css: the destination css
* @tset: taskset to iterate
*
* @tset may contain multiple tasks and they may belong to multiple
* processes. When there are multiple tasks in @tset, if a task of a
* process is in @tset, all tasks of the process are in @tset. Also, all
* are guaranteed to share the same source and destination csses.
* processes.
*
* On the v2 hierarchy, there may be tasks from multiple processes and they
* may not share the source or destination csses.
*
* On traditional hierarchies, when there are multiple tasks in @tset, if a
* task of a process is in @tset, all tasks of the process are in @tset.
* Also, all are guaranteed to share the same source and destination csses.
*
* Iteration is not in any specific order.
*/
#define cgroup_taskset_for_each(task, tset) \
for ((task) = cgroup_taskset_first((tset)); (task); \
(task) = cgroup_taskset_next((tset)))
#define cgroup_taskset_for_each(task, dst_css, tset) \
for ((task) = cgroup_taskset_first((tset), &(dst_css)); \
(task); \
(task) = cgroup_taskset_next((tset), &(dst_css)))

/**
* cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
* @leader: the loop cursor
* @dst_css: the destination css
* @tset: taskset to iterate
*
* Iterate threadgroup leaders of @tset. For single-task migrations, @tset
* may not contain any.
*/
#define cgroup_taskset_for_each_leader(leader, tset) \
for ((leader) = cgroup_taskset_first((tset)); (leader); \
(leader) = cgroup_taskset_next((tset))) \
#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \
for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \
(leader); \
(leader) = cgroup_taskset_next((tset), &(dst_css))) \
if ((leader) != (leader)->group_leader) \
; \
else
Expand Down
43 changes: 34 additions & 9 deletions kernel/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -2237,6 +2237,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;

/* the subsys currently being processed */
int ssid;

/*
* Fields for cgroup_taskset_*() iteration.
*
Expand Down Expand Up @@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task,
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* @tset iteration is initialized and the first task is returned.
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
tset->cur_task = NULL;

return cgroup_taskset_next(tset);
return cgroup_taskset_next(tset, dst_cssp);
}

/**
* cgroup_taskset_next - iterate to the next task in taskset
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* Return the next task in @tset. Iteration must have been initialized
* with cgroup_taskset_first().
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
Expand All @@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
if (&task->cg_list != &cset->mg_tasks) {
tset->cur_cset = cset;
tset->cur_task = task;

/*
* This function may be called both before and
* after cgroup_taskset_migrate(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
if (cset->mg_dst_cset)
*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
else
*dst_cssp = cset->subsys[tset->ssid];

return task;
}

Expand Down Expand Up @@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
/* check that we can legitimately attach to the cgroup */
for_each_e_css(css, i, dst_cgrp) {
if (css->ss->can_attach) {
ret = css->ss->can_attach(css, tset);
tset->ssid = i;
ret = css->ss->can_attach(tset);
if (ret) {
failed_css = css;
goto out_cancel_attach;
Expand Down Expand Up @@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;

for_each_e_css(css, i, dst_cgrp)
if (css->ss->attach)
css->ss->attach(css, tset);
for_each_e_css(css, i, dst_cgrp) {
if (css->ss->attach) {
tset->ssid = i;
css->ss->attach(tset);
}
}

ret = 0;
goto out_release_tset;
Expand All @@ -2411,8 +2434,10 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
for_each_e_css(css, i, dst_cgrp) {
if (css == failed_css)
break;
if (css->ss->cancel_attach)
css->ss->cancel_attach(css, tset);
if (css->ss->cancel_attach) {
tset->ssid = i;
css->ss->cancel_attach(tset);
}
}
out_release_tset:
spin_lock_bh(&css_set_lock);
Expand Down
6 changes: 3 additions & 3 deletions kernel/cgroup_freezer.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css)
* @freezer->lock. freezer_attach() makes the new tasks conform to the
* current state and all following state changes can see the new tasks.
*/
static void freezer_attach(struct cgroup_subsys_state *new_css,
struct cgroup_taskset *tset)
static void freezer_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *new_css;

mutex_lock(&freezer_mutex);

Expand All @@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* current state before executing the following - !frozen tasks may
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, new_css, tset) {
struct freezer *freezer = css_freezer(new_css);

if (!(freezer->state & CGROUP_FREEZING)) {
Expand Down
16 changes: 8 additions & 8 deletions kernel/cgroup_pids.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,13 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
return -EAGAIN;
}

static int pids_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static int pids_can_attach(struct cgroup_taskset *tset)
{
struct pids_cgroup *pids = css_pids(css);
struct task_struct *task;
struct cgroup_subsys_state *dst_css;

cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, dst_css, tset) {
struct pids_cgroup *pids = css_pids(dst_css);
struct cgroup_subsys_state *old_css;
struct pids_cgroup *old_pids;

Expand All @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css,
return 0;
}

static void pids_cancel_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void pids_cancel_attach(struct cgroup_taskset *tset)
{
struct pids_cgroup *pids = css_pids(css);
struct task_struct *task;
struct cgroup_subsys_state *dst_css;

cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, dst_css, tset) {
struct pids_cgroup *pids = css_pids(dst_css);
struct cgroup_subsys_state *old_css;
struct pids_cgroup *old_pids;

Expand Down
33 changes: 21 additions & 12 deletions kernel/cpuset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cpuset *cs = css_cs(css);
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct task_struct *task;
int ret;

/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);

mutex_lock(&cpuset_mutex);

Expand All @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;

cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task, cs->cpus_allowed);
if (ret)
goto out_unlock;
Expand All @@ -1467,9 +1468,14 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
return ret;
}

static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;

cgroup_taskset_first(tset, &css);
cs = css_cs(css);

mutex_lock(&cpuset_mutex);
css_cs(css)->attach_in_progress--;
mutex_unlock(&cpuset_mutex);
Expand All @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
*/
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void cpuset_attach(struct cgroup_taskset *tset)
{
/* static buf protected by cpuset_mutex */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cpuset *cs = css_cs(css);
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;

cgroup_taskset_first(tset, &css);
cs = css_cs(css);

mutex_lock(&cpuset_mutex);

/* prepare for attach */
Expand All @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,

guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, css, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
Expand All @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* sleep and should be moved outside migration path proper.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
cgroup_taskset_for_each_leader(leader, tset) {
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);

if (mm) {
Expand Down
6 changes: 3 additions & 3 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info)
return 0;
}

static void perf_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;

cgroup_taskset_for_each(task, tset)
cgroup_taskset_for_each(task, css, tset)
task_function_call(task, __perf_cgroup_move, task);
}

Expand Down
12 changes: 6 additions & 6 deletions kernel/sched/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private)
sched_move_task(task);
}

static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;

cgroup_taskset_for_each(task, tset) {
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
Expand All @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
return 0;
}

static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;

cgroup_taskset_for_each(task, tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}

Expand Down
Loading

0 comments on commit 1f7dd3e

Please sign in to comment.