Skip to content

Commit

Permalink
blkcg: implement REQ_CGROUP_PUNT
Browse files Browse the repository at this point in the history
When a shared kthread needs to issue a bio for a cgroup, doing so
synchronously can lead to priority inversions as the kthread can be
trapped waiting for that cgroup.  This patch implements
REQ_CGROUP_PUNT flag which makes submit_bio() punt the actual issuing
to a dedicated per-blkcg work item to avoid such priority inversions.

This will be used to fix priority inversions in btrfs compression and
should be generally useful as we grow filesystem support for
comprehensive IO control.

Cc: Chris Mason <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
htejun authored and axboe committed Jul 10, 2019
1 parent 653c45c commit d3f77df
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 4 deletions.
53 changes: 53 additions & 0 deletions block/blk-cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
Expand Down Expand Up @@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

WARN_ON(!bio_list_empty(&blkg->async_bios));

/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
Expand All @@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;

/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
bio_list_merge(&bios, &blkg->async_bios);
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);

while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
}

/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
Expand Down Expand Up @@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,

blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios);
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;

for (i = 0; i < BLKCG_MAX_POLS; i++) {
Expand Down Expand Up @@ -1528,6 +1551,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;

/* consume the flag first */
bio->bi_opf &= ~REQ_CGROUP_PUNT;

/* never bounce for the root cgroup */
if (!blkg->parent)
return false;

spin_lock_bh(&blkg->async_bio_lock);
bio_list_add(&blkg->async_bios, bio);
spin_unlock_bh(&blkg->async_bio_lock);

queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
return true;
}

/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
Expand Down Expand Up @@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}

static int __init blkcg_init(void)
{
blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
WQ_MEM_RECLAIM | WQ_FREEZABLE |
WQ_UNBOUND | WQ_SYSFS, 0);
if (!blkcg_punt_bio_wq)
return -ENOMEM;
return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
3 changes: 3 additions & 0 deletions block/blk-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;

/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
Expand Down
1 change: 1 addition & 0 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;
extern struct workqueue_struct *bdi_async_bio_wq;

static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
Expand Down
16 changes: 15 additions & 1 deletion include/linux/blk-cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,17 @@ struct blkcg_gq {

struct blkg_policy_data *pd[BLKCG_MAX_POLS];

struct rcu_head rcu_head;
spinlock_t async_bio_lock;
struct bio_list async_bios;
struct work_struct async_bio_work;

atomic_t use_delay;
atomic64_t delay_nsec;
atomic64_t delay_start;
u64 last_delay;
int last_use;

struct rcu_head rcu_head;
};

typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
Expand Down Expand Up @@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
struct bio *bio) { return false; }
#endif

bool __blkcg_punt_bio_submit(struct bio *bio);

static inline bool blkcg_punt_bio_submit(struct bio *bio)
{
if (bio->bi_opf & REQ_CGROUP_PUNT)
return __blkcg_punt_bio_submit(bio);
else
return false;
}

static inline void blkcg_bio_issue_init(struct bio *bio)
{
Expand Down Expand Up @@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }

static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct bio *bio) { return true; }
Expand Down
10 changes: 10 additions & 0 deletions include/linux/blk_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,14 @@ enum req_flag_bits {
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
__REQ_NOWAIT, /* Don't wait if request will block */
/*
* When a shared kthread needs to issue a bio for a cgroup, doing
* so synchronously can lead to priority inversions as the kthread
* can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
* submit_bio() punt the actual issuing to a dedicated per-blkcg
* work item to avoid such priority inversions.
*/
__REQ_CGROUP_PUNT,

/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
Expand All @@ -337,6 +345,8 @@ enum req_flag_bits {
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)

#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI (1ULL << __REQ_HIPRI)

Expand Down
13 changes: 10 additions & 3 deletions include/linux/writeback.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ struct writeback_control {
*/
unsigned no_cgroup_owner:1;

unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */

#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
Expand All @@ -94,12 +96,17 @@ struct writeback_control {

static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
int flags = 0;

if (wbc->punt_to_cgroup)
flags = REQ_CGROUP_PUNT;

if (wbc->sync_mode == WB_SYNC_ALL)
return REQ_SYNC;
flags |= REQ_SYNC;
else if (wbc->for_kupdate || wbc->for_background)
return REQ_BACKGROUND;
flags |= REQ_BACKGROUND;

return 0;
return flags;
}

static inline struct cgroup_subsys_state *
Expand Down

0 comments on commit d3f77df

Please sign in to comment.