Skip to content

Commit

Permalink
cgroup: fix frequent -EBUSY at rmdir
Browse files Browse the repository at this point in the history
In following situation, with memory subsystem,

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim
is triggered and the kernel walks tree under groupA. In this case,
rmdir /groupA/04 fails with -EBUSY frequently because of temporal
refcnt from the kernel.

In general. cgroup can be rmdir'd if there are no children groups and
no tasks. Frequent fails of rmdir() is not useful to users.
(And the reason for -EBUSY is unknown to users.....in most cases)

This patch tries to modify above behavior, by
	- retries if css_refcnt is got by someone.
	- add "return value" to pre_destroy() and allows subsystem to
	  say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Daisuke Nishimura <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
hkamezawa authored and torvalds committed Apr 3, 2009
1 parent 38460b4 commit ec64f51
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 19 deletions.
6 changes: 4 additions & 2 deletions Documentation/cgroups/cgroups.txt
Original file line number Diff line number Diff line change
Expand Up @@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a
newly-created cgroup if an error occurs after this subsystem's
create() method has been called for the new cgroup).

void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);

Called before checking the reference count on each subsystem. This may
be useful for subsystems which have some extra references even if
there are not tasks in the cgroup.
there are not tasks in the cgroup. If pre_destroy() returns error code,
rmdir() will fail with it. From this behavior, pre_destroy() can be
called multiple times against a cgroup.

int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task)
Expand Down
6 changes: 5 additions & 1 deletion include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ enum {
CGRP_RELEASABLE,
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
/*
* A thread in rmdir() is wating for this cgroup.
*/
CGRP_WAIT_ON_RMDIR,
};

struct cgroup {
Expand Down Expand Up @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
struct cgroup_subsys {
struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
struct cgroup *cgrp);
void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss,
struct cgroup *cgrp, struct task_struct *tsk);
Expand Down
81 changes: 67 additions & 14 deletions kernel/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
static void cgroup_call_pre_destroy(struct cgroup *cgrp)
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ret = 0;

for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy)
ss->pre_destroy(ss, cgrp);
return;
if (ss->pre_destroy) {
ret = ss->pre_destroy(ss, cgrp);
if (ret)
break;
}
return ret;
}

static void free_cgroup_rcu(struct rcu_head *obj)
Expand Down Expand Up @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
remove_dir(dentry);
}

/*
* A queue for waiters to do rmdir() cgroup. A tasks will sleep when
* cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
* CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);

static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
{
if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}

static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
Expand Down Expand Up @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);

/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiters(cgrp);
return 0;
}

Expand Down Expand Up @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
DEFINE_WAIT(wait);
int ret;

/* the vfs holds both inode->i_mutex already */

again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
Expand All @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
cgroup_call_pre_destroy(cgrp);
ret = cgroup_call_pre_destroy(cgrp);
if (ret)
return ret;

mutex_lock(&cgroup_mutex);
parent = cgrp->parent;

if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| !cgroup_clear_css_refs(cgrp)) {
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
/*
* css_put/get is provided for subsys to grab refcnt to css. In typical
* case, subsystem has no reference after pre_destroy(). But, under
* hierarchy management, some *temporal* refcnt can be hold.
* To avoid returning -EBUSY to a user, waitqueue is used. If subsys
* is really busy, it should return -EBUSY at pre_destroy(). wake_up
* is called when css_put() is called and refcnt goes down to 0.
*/
set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);

if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
return -EINTR;
goto again;
}
/* NO css_tryget() can success after here. */
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
Expand Down Expand Up @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
if ((atomic_dec_return(&css->refcnt) == 1) &&
notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
if (atomic_dec_return(&css->refcnt) == 1) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
cgroup_wakeup_rmdir_waiters(cgrp);
}
rcu_read_unlock();
}
Expand Down
5 changes: 3 additions & 2 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -2272,11 +2272,12 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
mem_cgroup_force_empty(mem, false);

return mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
Expand Down

0 comments on commit ec64f51

Please sign in to comment.