Skip to content

Commit

Permalink
cgroup: use cgroup->self.refcnt for cgroup refcnting
Browse files Browse the repository at this point in the history
Currently cgroup implements refcnting separately using atomic_t
cgroup->refcnt.  The destruction paths of cgroup and css are rather
complex and bear a lot of similarities including the use of RCU and
bouncing to a work item.

This patch makes cgroup use the refcnt of self css for refcnting
instead of using its own.  This makes cgroup refcnting use css's
percpu refcnt and share the destruction mechanism.

* css_release_work_fn() and css_free_work_fn() are updated to handle
  both csses and cgroups.  This is a bit messy but should do until we
  can make cgroup->self a full css, which currently can't be done
  thanks to multiple hierarchies.

* cgroup_destroy_locked() now performs
  percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp).

* Negative refcnt sanity check in cgroup_get() is no longer necessary
  as percpu_ref already handles it.

* Similarly, as a cgroup which hasn't been killed will never be
  released regardless of its refcnt value and percpu_ref has sanity
  check on kill, cgroup_is_dead() sanity check in cgroup_put() is no
  longer necessary.

* As whether a refcnt reached zero or not can only be decided after
  the reference count is killed, cgroup_root->cgrp's refcnting can no
  longer be used to decide whether to kill the root or not.  Let's
  make cgroup_kill_sb() explicitly initiate destruction if the root
  doesn't have any children.  This makes sense anyway as an unmounted
  cgroup hierarchy without any children should be destroyed.

While this is a bit messy, this will allow pushing more bookkeeping
towards cgroup->self and thus handling cgroups and csses in a more
uniform way.  In the very long term, it should be possible to
introduce a base subsystem and convert the self css to a proper one,
making things a whole lot simpler and unified.

Signed-off-by: Tejun Heo <[email protected]>
Acked-by: Li Zefan <[email protected]>
  • Loading branch information
htejun committed May 14, 2014
1 parent 9395a45 commit 9d755d3
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 72 deletions.
6 changes: 0 additions & 6 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,6 @@ struct cgroup {
*/
int populated_cnt;

atomic_t refcnt;

/*
* We link our 'sibling' struct into our parent's 'children'.
* Our children link their 'sibling' into our 'children'.
Expand Down Expand Up @@ -218,10 +216,6 @@ struct cgroup {
struct list_head pidlists;
struct mutex pidlist_mutex;

/* For css percpu_ref killing and RCU-protected deletion */
struct rcu_head rcu_head;
struct work_struct destroy_work;

/* used to wait for offlining of csses */
wait_queue_head_t offline_waitq;
};
Expand Down
146 changes: 80 additions & 66 deletions kernel/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_base_files[];

static void cgroup_put(struct cgroup *cgrp);
static bool cgroup_has_live_children(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
Expand Down Expand Up @@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}

static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);

atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);

if (cgrp->parent) {
/*
* We get a ref to the parent, and put the ref when this
* cgroup is being freed, so it's guaranteed that the
* parent won't be destroyed before its children.
*/
cgroup_put(cgrp->parent);
kernfs_put(cgrp->kn);
kfree(cgrp);
} else {
/*
* This is root cgroup's refcnt reaching zero, which
* indicates that the root should be released.
*/
cgroup_destroy_root(cgrp->root);
}
}

static void cgroup_free_rcu(struct rcu_head *head)
{
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}

static void cgroup_get(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
atomic_inc(&cgrp->refcnt);
css_get(&cgrp->self);
}

static void cgroup_put(struct cgroup *cgrp)
{
if (!atomic_dec_and_test(&cgrp->refcnt))
return;
if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
return;

/* delete this cgroup from parent->children */
mutex_lock(&cgroup_mutex);
list_del_rcu(&cgrp->sibling);
mutex_unlock(&cgroup_mutex);

cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;

call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
css_put(&cgrp->self);
}

/**
Expand Down Expand Up @@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
struct cgroup_subsys *ss;
int ssid;

atomic_set(&cgrp->refcnt, 1);
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->cset_links);
Expand Down Expand Up @@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
goto out;
root_cgrp->id = ret;

ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
if (ret)
goto out;

/*
* We're accessing css_set_count without locking css_set_rwsem here,
* but that's OK - it can only be increased by someone holding
Expand All @@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
*/
ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
goto out;
goto cancel_ref;

ret = cgroup_init_root_id(root);
if (ret)
goto out;
goto cancel_ref;

root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
KERNFS_ROOT_CREATE_DEACTIVATED,
Expand Down Expand Up @@ -1657,6 +1615,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
percpu_ref_cancel_init(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
Expand Down Expand Up @@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
}

/*
* A root's lifetime is governed by its root cgroup. Zero
* ref indicate that the root is being destroyed. Wait for
* destruction to complete so that the subsystems are free.
* We can use wait_queue for the wait but this path is
* super cold. Let's just sleep for a bit and retry.
* A root's lifetime is governed by its root cgroup.
* A tryget_live failure indicates that the root is being
* destroyed. Wait for destruction to complete so that the
* subsystems are free. We can use wait_queue for the wait
* but this path is super cold. Let's just sleep for a bit
* and retry.
*/
if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
Expand Down Expand Up @@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb)
struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);

cgroup_put(&root->cgrp);
/*
* If @root doesn't have any mounts or children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*/
if (cgroup_has_live_children(&root->cgrp))
cgroup_put(&root->cgrp);
else
percpu_ref_kill(&root->cgrp.self.refcnt);

kernfs_kill_sb(sb);
}

Expand Down Expand Up @@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;

if (css->parent)
css_put(css->parent);
if (css->ss) {
/* css free path */
if (css->parent)
css_put(css->parent);

css->ss->css_free(css);
cgroup_put(cgrp);
css->ss->css_free(css);
cgroup_put(cgrp);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);

if (cgrp->parent) {
/*
* We get a ref to the parent, and put the ref when
* this cgroup is being freed, so it's guaranteed
* that the parent won't be destroyed before its
* children.
*/
cgroup_put(cgrp->parent);
kernfs_put(cgrp->kn);
kfree(cgrp);
} else {
/*
* This is root cgroup's refcnt reaching zero,
* which indicates that the root should be
* released.
*/
cgroup_destroy_root(cgrp->root);
}
}
}

static void css_free_rcu_fn(struct rcu_head *rcu_head)
Expand All @@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;

cgroup_idr_remove(&ss->css_idr, css->id);
if (ss) {
/* css release path */
cgroup_idr_remove(&ss->css_idr, css->id);
} else {
/* cgroup release path */
mutex_lock(&cgroup_mutex);
list_del_rcu(&cgrp->sibling);
mutex_unlock(&cgroup_mutex);

cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
}

call_rcu(&css->rcu_head, css_free_rcu_fn);
}
Expand Down Expand Up @@ -4285,14 +4293,18 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
goto out_unlock;
}

ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
if (ret)
goto out_free_cgrp;

/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_free_cgrp;
goto out_cancel_ref;
}

init_cgroup_housekeeping(cgrp);
Expand Down Expand Up @@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,

out_free_id:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
percpu_ref_cancel_init(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
out_unlock:
Expand Down Expand Up @@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
check_for_release(cgrp->parent);

/* put the base reference */
cgroup_put(cgrp);
percpu_ref_kill(&cgrp->self.refcnt);

return 0;
};
Expand Down

0 comments on commit 9d755d3

Please sign in to comment.