Skip to content

Commit

Permalink
writeback: make backing_dev_info host cgroup-specific bdi_writebacks
Browse files Browse the repository at this point in the history
For the planned cgroup writeback support, on each bdi
(backing_dev_info), each memcg will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

On the default hierarchy, blkcg implicitly enables memcg.  This allows
using memcg's page ownership for attributing writeback IOs, and every
memcg - blkcg combination can be served by its own wb by assigning a
dedicated wb to each memcg.  This means that there may be multiple
wb's of a bdi mapped to the same blkcg.  As congested state is per
blkcg - bdi combination, those wb's should share the same congested
state.  This is achieved by tracking congested state via
bdi_writeback_congested structs which are keyed by blkcg.

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up while dirtying an inode according to the memcg of the page
being dirtied or current task.  Each cgwb is indexed on bdi->cgwb_tree
by its memcg id.  Once an inode is associated with its wb, it can be
retrieved using inode_to_wb().

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

v3: inode_attach_wb() in account_page_dirtied() moved inside
    mapping_cap_account_dirty() block where it's known to be !NULL.
    Also, an unnecessary NULL check before kfree() removed.  Both
    detected by the kbuild bot.

v2: Updated so that wb association is per inode and wb is per memcg
    rather than blkcg.

Signed-off-by: Tejun Heo <[email protected]>
Cc: kbuild test robot <[email protected]>
Cc: Dan Carpenter <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Jan Kara <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
htejun authored and axboe committed Jun 2, 2015
1 parent 89e9b9e commit 52ebea7
Show file tree
Hide file tree
Showing 11 changed files with 698 additions and 11 deletions.
7 changes: 6 additions & 1 deletion block/blk-cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
Expand Down Expand Up @@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
}

spin_unlock_irq(&blkcg->lock);

wb_blkcg_offline(blkcg);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
Expand Down Expand Up @@ -827,7 +830,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);

#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
return &blkcg->css;
}

Expand Down
8 changes: 5 additions & 3 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = inode_to_wb(inode);

spin_lock(&bdi->wb.list_lock);
spin_lock(&wb->list_lock);
list_del_init(&inode->i_wb_list);
spin_unlock(&bdi->wb.list_lock);
spin_unlock(&wb->list_lock);
}

/*
Expand Down Expand Up @@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;

inode_attach_wb(inode, NULL);

if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
Expand Down
1 change: 1 addition & 0 deletions fs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
Expand Down
59 changes: 56 additions & 3 deletions include/linux/backing-dev-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
Expand Down Expand Up @@ -37,10 +40,43 @@ enum wb_stat_item {

#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
* For cgroup writeback, multiple wb's may map to the same blkcg. Those
* wb's can operate mostly independently but should share the congested
* state. To facilitate such sharing, the congested state is tracked using
* the following struct which is created on demand, indexed by blkcg ID on
* its bdi, and refcounted.
*/
struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */

#ifdef CONFIG_CGROUP_WRITEBACK
struct backing_dev_info *bdi; /* the associated bdi */
atomic_t refcnt; /* nr of attached wb's and blkg */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif
};

/*
* Each wb (bdi_writeback) can perform writeback operations, is measured
* and throttled, independently. Without cgroup writeback, each bdi
* (bdi_writeback) is served by its embedded bdi->wb.
*
* On the default hierarchy, blkcg implicitly enables memcg. This allows
* using memcg's page ownership for attributing writeback IOs, and every
* memcg - blkcg combination can be served by its own wb by assigning a
* dedicated wb to each memcg, which enables isolation across different
* cgroups and propagation of IO back pressure down from the IO layer upto
* the tasks which are generating the dirty pages to be written back.
*
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
* refcounted with the number of inodes attached to it, and pins the memcg
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
* is tested for blkcg after lookup and removed from index on mismatch so
* that a new wb for the combination can be created.
*/
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */

Expand Down Expand Up @@ -78,6 +114,19 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */

#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */

union {
struct work_struct release_work;
struct rcu_head rcu;
};
#endif
};

struct backing_dev_info {
Expand All @@ -92,9 +141,13 @@ struct backing_dev_info {
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;

struct bdi_writeback wb; /* default writeback info for this bdi */
struct bdi_writeback_congested wb_congested;

struct bdi_writeback wb; /* the root writeback info for this bdi */
struct bdi_writeback_congested wb_congested; /* its congested state */
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#endif
struct device *dev;

struct timer_list laptop_mode_wb_timer;
Expand Down
195 changes: 195 additions & 0 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>

int __must_check bdi_init(struct backing_dev_info *bdi);
Expand Down Expand Up @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word)

#ifdef CONFIG_CGROUP_WRITEBACK

struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
void wb_congested_put(struct bdi_writeback_congested *congested);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp);
void __inode_attach_wb(struct inode *inode, struct page *page);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);

/**
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
* @inode: inode of interest
Expand All @@ -250,13 +261,197 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
}

/**
* wb_tryget - try to increment a wb's refcount
* @wb: bdi_writeback to get
*/
static inline bool wb_tryget(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
return percpu_ref_tryget(&wb->refcnt);
return true;
}

/**
* wb_get - increment a wb's refcount
* @wb: bdi_writeback to get
*/
static inline void wb_get(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
percpu_ref_get(&wb->refcnt);
}

/**
* wb_put - decrement a wb's refcount
* @wb: bdi_writeback to put
*/
static inline void wb_put(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
percpu_ref_put(&wb->refcnt);
}

/**
* wb_find_current - find wb for %current on a bdi
* @bdi: bdi of interest
*
* Find the wb of @bdi which matches both the memcg and blkcg of %current.
* Must be called under rcu_read_lock() which protects the returend wb.
* NULL if not found.
*/
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;

memcg_css = task_css(current, memory_cgrp_id);
if (!memcg_css->parent)
return &bdi->wb;

wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

/*
* %current's blkcg equals the effective blkcg of its memcg. No
* need to use the relatively expensive cgroup_get_e_css().
*/
if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
return wb;
return NULL;
}

/**
* wb_get_create_current - get or create wb for %current on a bdi
* @bdi: bdi of interest
* @gfp: allocation mask
*
* Equivalent to wb_get_create() on %current's memcg. This function is
* called from a relatively hot path and optimizes the common cases using
* wb_find_current().
*/
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
struct bdi_writeback *wb;

rcu_read_lock();
wb = wb_find_current(bdi);
if (wb && unlikely(!wb_tryget(wb)))
wb = NULL;
rcu_read_unlock();

if (unlikely(!wb)) {
struct cgroup_subsys_state *memcg_css;

memcg_css = task_get_css(current, memory_cgrp_id);
wb = wb_get_create(bdi, memcg_css, gfp);
css_put(memcg_css);
}
return wb;
}

/**
* inode_attach_wb - associate an inode with its wb
* @inode: inode of interest
* @page: page being dirtied (may be NULL)
*
* If @inode doesn't have its wb, associate it with the wb matching the
* memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
* @inode->i_lock.
*/
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
if (!inode->i_wb)
__inode_attach_wb(inode, page);
}

/**
* inode_detach_wb - disassociate an inode from its wb
* @inode: inode of interest
*
* @inode is being freed. Detach from its wb.
*/
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
}

/**
* inode_to_wb - determine the wb of an inode
* @inode: inode of interest
*
* Returns the wb @inode is currently associated with.
*/
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return inode->i_wb;
}

#else /* CONFIG_CGROUP_WRITEBACK */

static inline bool inode_cgwb_enabled(struct inode *inode)
{
return false;
}

static inline struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{
return bdi->wb.congested;
}

static inline void wb_congested_put(struct bdi_writeback_congested *congested)
{
}

static inline bool wb_tryget(struct bdi_writeback *wb)
{
return true;
}

static inline void wb_get(struct bdi_writeback *wb)
{
}

static inline void wb_put(struct bdi_writeback *wb)
{
}

static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
return &bdi->wb;
}

static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
return &bdi->wb;
}

static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
}

static inline void inode_detach_wb(struct inode *inode)
{
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return &inode_to_bdi(inode)->wb;
}

static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}

#endif /* CONFIG_CGROUP_WRITEBACK */

#endif /* _LINUX_BACKING_DEV_H */
4 changes: 4 additions & 0 deletions include/linux/blk-cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ struct blkcg {
/* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight;

#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
};

struct blkg_stat {
Expand Down
Loading

0 comments on commit 52ebea7

Please sign in to comment.