Skip to content

Commit

Permalink
Merge branch 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/ke…
Browse files Browse the repository at this point in the history
…rnel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "This has an unusually high density of tricky fixes:

   - task_get_css() could deadlock when it races against a dying cgroup.

   - cgroup.procs didn't list thread group leaders with live threads.

     This could mislead readers to think that a cgroup is empty when
     it's not. Fixed by making the PROCS iterator include dead tasks. I
     made a couple of mistakes making this change and this pull request
     contains a couple of follow-up patches.

   - When a cpuset runs out of online cpus, it updates the cpumasks of its
     member tasks in bizarre ways. Joel improved the behavior significantly"

* 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: restore sanity to cpuset_cpus_allowed_fallback()
  cgroup: Fix css_task_iter_advance_css_set() cset skip condition
  cgroup: css_task_iter_skip()'d iterators must be advanced before accessed
  cgroup: Include dying leaders with live threads in PROCS iterations
  cgroup: Implement css_task_iter_skip()
  cgroup: Call cgroup_release() before __exit_signal()
  docs cgroups: add another example size for hugetlb
  cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()
  • Loading branch information
torvalds committed Jun 15, 2019
2 parents 6aa7a22 + d477f8c commit 0011572
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 43 deletions.
22 changes: 13 additions & 9 deletions Documentation/cgroup-v1/hugetlb.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,18 @@ Brief summary of control files
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failures due to the HugeTLB limit

For a system supporting two hugepage size (16M and 16G) the control
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include:

hugetlb.16GB.limit_in_bytes
hugetlb.16GB.max_usage_in_bytes
hugetlb.16GB.usage_in_bytes
hugetlb.16GB.failcnt
hugetlb.16MB.limit_in_bytes
hugetlb.16MB.max_usage_in_bytes
hugetlb.16MB.usage_in_bytes
hugetlb.16MB.failcnt
hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes
hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt
hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes
hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt
hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt
1 change: 1 addition & 0 deletions include/linux/cgroup-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ struct css_set {
*/
struct list_head tasks;
struct list_head mg_tasks;
struct list_head dying_tasks;

/* all css_task_iters currently walking this cset */
struct list_head task_iters;
Expand Down
14 changes: 12 additions & 2 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
/* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED (1U << 1)

/* internal flags */
#define CSS_TASK_ITER_SKIPPED (1U << 16)

/* a css_task_iter should be treated as an opaque object */
struct css_task_iter {
struct cgroup_subsys *ss;
Expand All @@ -57,6 +60,7 @@ struct css_task_iter {
struct list_head *task_pos;
struct list_head *tasks_head;
struct list_head *mg_tasks_head;
struct list_head *dying_tasks_head;

struct css_set *cur_cset;
struct css_set *cur_dcset;
Expand Down Expand Up @@ -487,7 +491,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
*
* Find the css for the (@task, @subsys_id) combination, increment a
* reference on and return it. This function is guaranteed to return a
* valid css.
* valid css. The returned css may already have been offlined.
*/
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
Expand All @@ -497,7 +501,13 @@ task_get_css(struct task_struct *task, int subsys_id)
rcu_read_lock();
while (true) {
css = task_css(task, subsys_id);
if (likely(css_tryget_online(css)))
/*
* Can't use css_tryget_online() here. A task which has
* PF_EXITING set may stay associated with an offline css.
* If such task calls this function, css_tryget_online()
* will keep failing.
*/
if (likely(css_tryget(css)))
break;
cpu_relax();
}
Expand Down
106 changes: 76 additions & 30 deletions kernel/cgroup/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
Expand Down Expand Up @@ -738,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
Expand Down Expand Up @@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated);
}

/*
* @task is leaving, advance task iterators which are pointing to it so
* that they can resume at the next position. Advancing an iterator might
* remove it from the list, use safe walk. See css_task_iter_skip() for
* details.
*/
static void css_set_skip_task_iters(struct css_set *cset,
struct task_struct *task)
{
struct css_task_iter *it, *pos;

list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
css_task_iter_skip(it, task);
}

/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
Expand All @@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true);

if (from_cset) {
struct css_task_iter *it, *pos;

WARN_ON_ONCE(list_empty(&task->cg_list));

/*
* @task is leaving, advance task iterators which are
* pointing to it so that they can resume at the next
* position. Advancing an iterator might remove it from
* the list, use safe walk. See css_task_iter_advance*()
* for details.
*/
list_for_each_entry_safe(it, pos, &from_cset->task_iters,
iters_node)
if (it->task_pos == &task->cg_list)
css_task_iter_advance(it);

css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
Expand Down Expand Up @@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
Expand Down Expand Up @@ -4408,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL;
return;
}
} while (!css_set_populated(cset));
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));

if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next;
else
it->task_pos = cset->dying_tasks.next;

it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
it->dying_tasks_head = &cset->dying_tasks;

/*
* We don't keep css_sets locked across iteration steps and thus
Expand All @@ -4442,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters);
}

/*
 * css_task_iter_skip - step @it over @task if @it currently points at it
 * @it: the task iterator to adjust
 * @task: the task whose cg_list entry must no longer be referenced by @it
 *
 * If @it->task_pos is @task's cg_list node, move it to the following list
 * node and mark the iterator with CSS_TASK_ITER_SKIPPED.  Iterators which
 * are positioned elsewhere are left untouched.
 *
 * Must be called with css_set_lock held.
 */
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task)
{
	lockdep_assert_held(&css_set_lock);

	/* nothing to do unless @it is positioned on @task */
	if (it->task_pos != &task->cg_list)
		return;

	it->task_pos = it->task_pos->next;
	it->flags |= CSS_TASK_ITER_SKIPPED;
}

static void css_task_iter_advance(struct css_task_iter *it)
{
struct list_head *next;
struct task_struct *task;

lockdep_assert_held(&css_set_lock);
repeat:
Expand All @@ -4454,25 +4473,40 @@ static void css_task_iter_advance(struct css_task_iter *it)
* consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset.
*/
next = it->task_pos->next;

if (next == it->tasks_head)
next = it->mg_tasks_head->next;
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;

if (next == it->mg_tasks_head)
if (it->task_pos == it->tasks_head)
it->task_pos = it->mg_tasks_head->next;
if (it->task_pos == it->mg_tasks_head)
it->task_pos = it->dying_tasks_head->next;
if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
else
it->task_pos = next;
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}

/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
!thread_group_leader(list_entry(it->task_pos, struct task_struct,
cg_list)))
goto repeat;
if (!it->task_pos)
return;

task = list_entry(it->task_pos, struct task_struct, cg_list);

if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
if (!thread_group_leader(task))
goto repeat;

/* and dying leaders w/o live member threads */
if (!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
if (task->flags & PF_EXITING)
goto repeat;
}
}

/**
Expand Down Expand Up @@ -4528,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)

spin_lock_irq(&css_set_lock);

/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
css_task_iter_advance(it);

if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
Expand Down Expand Up @@ -6009,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;

WARN_ON_ONCE(cgroup_task_frozen(tsk));
Expand All @@ -6034,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();

if (use_task_css_set_links) {
spin_lock_irq(&css_set_lock);
css_set_skip_task_iters(task_css_set(task), task);
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
}

void cgroup_free(struct task_struct *task)
Expand Down
15 changes: 14 additions & 1 deletion kernel/cgroup/cpuset.c
Original file line number Diff line number Diff line change
Expand Up @@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags);
}

/**
* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
* @tsk: pointer to task_struct with which the scheduler is struggling
*
* Description: In the case that the scheduler cannot find an allowed cpu in
* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
* mode however, this value is the same as task_cs(tsk)->effective_cpus,
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
**/

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
rcu_read_lock();
do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
do_set_cpus_allowed(tsk, is_in_v2_mode() ?
task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock();

/*
Expand Down
2 changes: 1 addition & 1 deletion kernel/exit.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ void release_task(struct task_struct *p)
rcu_read_unlock();

proc_flush_task(p);
cgroup_release(p);

write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
Expand All @@ -220,7 +221,6 @@ void release_task(struct task_struct *p)
}

write_unlock_irq(&tasklist_lock);
cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);

Expand Down

0 comments on commit 0011572

Please sign in to comment.