Skip to content

Commit

Permalink
Merge tag 'for-linus-20180210' of git://git.kernel.dk/linux-block
Browse files Browse the repository at this point in the history
Pull block fixes from Jens Axboe:
 "A few fixes to round off the merge window on the block side:

   - a set of bcache fixes by way of Michael Lyle, from the usual bcache
     suspects.

   - add a simple-to-hook-into function for bpf EIO error injection.

   - fix blk-wbt that mischaracterized flushes as reads. Improve the logic
     so that flushes and writes are accounted as writes, and only reads
     as reads. From me.

   - fix requeue crash in BFQ, from Paolo"

* tag 'for-linus-20180210' of git://git.kernel.dk/linux-block:
  block, bfq: add requeue-request hook
  bcache: fix for data collapse after re-attaching an attached device
  bcache: return attach error when no cache set exist
  bcache: set writeback_rate_update_seconds in range [1, 60] seconds
  bcache: fix for allocator and register thread race
  bcache: set error_limit correctly
  bcache: properly set task state in bch_writeback_thread()
  bcache: fix high CPU occupancy during journal
  bcache: add journal statistic
  block: Add should_fail_bio() for bpf error injection
  blk-wbt: account flush requests correctly
  • Loading branch information
torvalds committed Feb 10, 2018
2 parents cc5cb5a + 8525e5f commit 9454473
Show file tree
Hide file tree
Showing 12 changed files with 212 additions and 63 deletions.
107 changes: 82 additions & 25 deletions block/bfq-iosched.c
Original file line number Diff line number Diff line change
Expand Up @@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
}

/*
* We exploit the bfq_finish_request hook to decrement
* rq_in_driver, but bfq_finish_request will not be
* invoked on this request. So, to avoid unbalance,
* just start this request, without incrementing
* rq_in_driver. As a negative consequence,
* rq_in_driver is deceptively lower than it should be
* while this request is in service. This may cause
* bfq_schedule_dispatch to be invoked uselessly.
* We exploit the bfq_finish_requeue_request hook to
* decrement rq_in_driver, but
* bfq_finish_requeue_request will not be invoked on
* this request. So, to avoid unbalance, just start
* this request, without incrementing rq_in_driver. As
* a negative consequence, rq_in_driver is deceptively
* lower than it should be while this request is in
* service. This may cause bfq_schedule_dispatch to be
* invoked uselessly.
*
* As for implementing an exact solution, the
* bfq_finish_request hook, if defined, is probably
* invoked also on this request. So, by exploiting
* this hook, we could 1) increment rq_in_driver here,
* and 2) decrement it in bfq_finish_request. Such a
* solution would let the value of the counter be
* always accurate, but it would entail using an extra
* interface function. This cost seems higher than the
* benefit, being the frequency of non-elevator-private
* bfq_finish_requeue_request hook, if defined, is
* probably invoked also on this request. So, by
* exploiting this hook, we could 1) increment
* rq_in_driver here, and 2) decrement it in
* bfq_finish_requeue_request. Such a solution would
* let the value of the counter be always accurate,
* but it would entail using an extra interface
* function. This cost seems higher than the benefit,
* being the frequency of non-elevator-private
* requests very low.
*/
goto start_rq;
Expand Down Expand Up @@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
unsigned int cmd_flags) {}
#endif

static void bfq_prepare_request(struct request *rq, struct bio *bio);

static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
Expand All @@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
} else {
if (WARN_ON_ONCE(!bfqq)) {
/*
* This should never happen. Most likely rq is
* a requeued regular request, being
* re-inserted without being first
* re-prepared. Do a prepare, to avoid
* failure.
*/
bfq_prepare_request(rq, rq->bio);
bfqq = RQ_BFQQ(rq);
}

idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
Expand Down Expand Up @@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_schedule_dispatch(bfqd);
}

static void bfq_finish_request_body(struct bfq_queue *bfqq)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{
bfqq->allocated--;

bfq_put_queue(bfqq);
}

static void bfq_finish_request(struct request *rq)
/*
* Handle either a requeue or a finish for rq. The things to do are
* the same in both cases: all references to rq are to be dropped. In
* particular, rq is considered completed from the point of view of
* the scheduler.
*/
static void bfq_finish_requeue_request(struct request *rq)
{
struct bfq_queue *bfqq;
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd;

if (!rq->elv.icq)
/*
* Requeue and finish hooks are invoked in blk-mq without
* checking whether the involved request is actually still
* referenced in the scheduler. To handle this fact, the
* following two checks make this function exit in case of
* spurious invocations, for which there is nothing to do.
*
* First, check whether rq has nothing to do with an elevator.
*/
if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
return;

/*
* rq either is not associated with any icq, or is an already
* requeued request that has not (yet) been re-inserted into
* a bfq_queue.
*/
if (!rq->elv.icq || !bfqq)
return;

bfqq = RQ_BFQQ(rq);
bfqd = bfqq->bfqd;

if (rq->rq_flags & RQF_STARTED)
Expand All @@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
spin_lock_irqsave(&bfqd->lock, flags);

bfq_completed_request(bfqq, bfqd);
bfq_finish_request_body(bfqq);
bfq_finish_requeue_request_body(bfqq);

spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
/*
* Request rq may be still/already in the scheduler,
* in which case we need to remove it. And we cannot
* in which case we need to remove it (this should
* never happen in case of requeue). And we cannot
* defer such a check and removal, to avoid
* inconsistencies in the time interval from the end
* of this function to the start of the deferred work.
Expand All @@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
bfqg_stats_update_io_remove(bfqq_group(bfqq),
rq->cmd_flags);
}
bfq_finish_request_body(bfqq);
bfq_finish_requeue_request_body(bfqq);
}

/*
* Reset private fields. In case of a requeue, this allows
* this function to correctly do nothing if it is spuriously
* invoked again on this same request (see the check at the
* beginning of the function). Probably, a better general
* design would be to prevent blk-mq from invoking the requeue
* or finish hooks of an elevator, for a request that is not
* referred to by that elevator.
*
* Resetting the following fields would break the
* request-insertion logic if rq is re-inserted into a bfq
* internal queue, without a re-preparation. Here we assume
* that re-insertions of requeued requests, without
* re-preparation, can happen only for pass_through or at_head
* requests (which are not re-inserted into bfq internal
* queues).
*/
rq->elv.priv[0] = NULL;
rq->elv.priv[1] = NULL;
}
Expand Down Expand Up @@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
.ops.mq = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.finish_request = bfq_finish_request,
.requeue_request = bfq_finish_requeue_request,
.finish_request = bfq_finish_requeue_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,
Expand Down
11 changes: 10 additions & 1 deletion block/blk-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
Expand Down Expand Up @@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
return false;
}

/*
 * Decide whether @bio should be failed artificially.
 *
 * Asks should_fail_request() about the part0 of the bio's disk and the
 * bio's size (bio->bi_iter.bi_size).
 *
 * Returns -EIO when should_fail_request() reports a failure, 0 otherwise.
 */
static noinline int should_fail_bio(struct bio *bio)
{
	return should_fail_request(&bio->bi_disk->part0,
				   bio->bi_iter.bi_size) ? -EIO : 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
Expand Down Expand Up @@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;

if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
if (should_fail_bio(bio))
goto end_io;

if (!bio->bi_partno) {
Expand Down
10 changes: 9 additions & 1 deletion block/blk-wbt.c
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)

static int wbt_data_dir(const struct request *rq)
{
return rq_data_dir(rq);
const int op = req_op(rq);

if (op == REQ_OP_READ)
return READ;
else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
return WRITE;

/* don't account */
return -1;
}

int wbt_init(struct request_queue *q)
Expand Down
4 changes: 3 additions & 1 deletion drivers/md/bcache/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,10 @@ do { \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
if (kthread_should_stop()) \
if (kthread_should_stop()) { \
set_current_state(TASK_RUNNING); \
return 0; \
} \
\
schedule(); \
mutex_lock(&(ca)->set->bucket_lock); \
Expand Down
9 changes: 8 additions & 1 deletion drivers/md/bcache/bcache.h
Original file line number Diff line number Diff line change
Expand Up @@ -658,10 +658,15 @@ struct cache_set {
atomic_long_t writeback_keys_done;
atomic_long_t writeback_keys_failed;

atomic_long_t reclaim;
atomic_long_t flush_write;
atomic_long_t retry_flush_write;

enum {
ON_ERROR_UNREGISTER,
ON_ERROR_PANIC,
} on_error;
#define DEFAULT_IO_ERROR_LIMIT 8
unsigned error_limit;
unsigned error_decay;

Expand All @@ -675,6 +680,8 @@ struct cache_set {

#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];

DECLARE_HEAP(struct btree *, flush_btree);
};

struct bbio {
Expand Down Expand Up @@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);

int bch_flash_dev_create(struct cache_set *c, uint64_t size);

int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
void bch_cached_dev_detach(struct cached_dev *);
void bch_cached_dev_run(struct cached_dev *);
void bcache_device_stop(struct bcache_device *);
Expand Down
9 changes: 6 additions & 3 deletions drivers/md/bcache/btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
*/
for_each_cache(ca, c, i) {
for_each_bucket(b, ca) {
if (fifo_full(&ca->free[RESERVE_PRIO]))
if (fifo_full(&ca->free[RESERVE_PRIO]) &&
fifo_full(&ca->free[RESERVE_BTREE]))
break;

if (bch_can_invalidate_bucket(ca, b) &&
!GC_MARK(b)) {
__bch_invalidate_one_bucket(ca, b);
fifo_push(&ca->free[RESERVE_PRIO],
b - ca->buckets);
if (!fifo_push(&ca->free[RESERVE_PRIO],
b - ca->buckets))
fifo_push(&ca->free[RESERVE_BTREE],
b - ca->buckets);
}
}
}
Expand Down
52 changes: 37 additions & 15 deletions drivers/md/bcache/journal.c
Original file line number Diff line number Diff line change
Expand Up @@ -368,35 +368,54 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
}

/* Journalling */
/*
 * Comparison helpers for btree nodes l and r, passed as the comparison
 * argument to the heap macros. Each node is ranked by fifo_idx() of the
 * journal entry attached to its current write, taken within
 * c->journal.pin.
 *
 * NOTE: both macros reference a variable named c that is NOT a macro
 * parameter; it must be in scope wherever they are expanded.
 *
 * journal_max_cmp(l, r): true when l's index is lower than r's.
 * journal_min_cmp(l, r): true when l's index is higher than r's.
 */
#define journal_max_cmp(l, r) \
(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
#define journal_min_cmp(l, r) \
(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))

static void btree_flush_write(struct cache_set *c)
{
/*
* Try to find the btree node that references the oldest journal
* entry, best is our current candidate and is locked if non NULL:
*/
struct btree *b, *best;
unsigned i;
struct btree *b;
int i;

atomic_long_inc(&c->flush_write);

retry:
best = NULL;

for_each_cached_btree(b, c, i)
if (btree_current_write(b)->journal) {
if (!best)
best = b;
else if (journal_pin_cmp(c,
btree_current_write(best)->journal,
btree_current_write(b)->journal)) {
best = b;
spin_lock(&c->journal.lock);
if (heap_empty(&c->flush_btree)) {
for_each_cached_btree(b, c, i)
if (btree_current_write(b)->journal) {
if (!heap_full(&c->flush_btree))
heap_add(&c->flush_btree, b,
journal_max_cmp);
else if (journal_max_cmp(b,
heap_peek(&c->flush_btree))) {
c->flush_btree.data[0] = b;
heap_sift(&c->flush_btree, 0,
journal_max_cmp);
}
}
}

b = best;
for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
heap_sift(&c->flush_btree, i, journal_min_cmp);
}

b = NULL;
heap_pop(&c->flush_btree, b, journal_min_cmp);
spin_unlock(&c->journal.lock);

if (b) {
mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock);
/* We raced */
atomic_long_inc(&c->retry_flush_write);
goto retry;
}

Expand Down Expand Up @@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
unsigned iter, n = 0;
atomic_t p;

atomic_long_inc(&c->reclaim);

while (!atomic_read(&fifo_front(&c->journal.pin)))
fifo_pop(&c->journal.pin, p);

Expand Down Expand Up @@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
j->w[0].c = c;
j->w[1].c = c;

if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;
Expand Down
Loading

0 comments on commit 9454473

Please sign in to comment.