Skip to content

Commit

Permalink
cfq-iosched: add close cooperator code
Browse files Browse the repository at this point in the history
If we have processes that are working in close proximity to each
other on disk, we don't want to idle wait. Instead allow the close
process to issue a request, getting better aggregate bandwidth.
The anticipatory scheduler has similar checks, noop and deadline do
not need it since they don't care about process <-> io mappings.

The code for CFQ is a little more involved though, since we split
request queues into per-process contexts.

This fixes a performance problem with e.g. dump(8), since it uses
several processes in some silly attempt to speed IO up. Even if
dump(8) isn't really a valid case (it should be fixed by using
CLONE_IO), there are other cases where we see close processes
and where idling ends up hurting performance.

Credit goes to Jeff Moyer <[email protected]> for writing the
initial implementation.

Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
Jens Axboe committed Apr 15, 2009
1 parent 9481ffd commit a36e71f
Showing 1 changed file with 198 additions and 25 deletions.
223 changes: 198 additions & 25 deletions block/cfq-iosched.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ struct cfq_data {
* rr list of queues with requests and the count of them
*/
struct cfq_rb_root service_tree;

/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
* interleaving requests (see cfq_close_cooperator).
*/
struct rb_root prio_trees[CFQ_PRIO_LISTS];

unsigned int busy_queues;
/*
* Used to track any pending rt requests so we can pre-empt current
Expand Down Expand Up @@ -144,6 +152,8 @@ struct cfq_queue {
struct rb_node rb_node;
/* service_tree key */
unsigned long rb_key;
/* prio tree member */
struct rb_node p_node;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
Expand Down Expand Up @@ -182,6 +192,7 @@ enum cfqq_state_flags {
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
};

#define CFQ_CFQQ_FNS(name) \
Expand All @@ -208,6 +219,7 @@ CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
#undef CFQ_CFQQ_FNS

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
Expand Down Expand Up @@ -416,13 +428,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}

/*
 * Unlink @n from the rbtree rooted at @root, then clear the node so that
 * later RB_EMPTY_NODE(@n) checks (used by callers in this file before
 * erasing) see it as no longer being on a tree.
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
/* must come after rb_erase(): marks the node as unlinked */
RB_CLEAR_NODE(n);
}

/*
 * Remove @n from the service tree @root.
 *
 * If @n is the cached leftmost node, the cache is invalidated first. The
 * node is then erased and cleared exactly once via rb_erase_init(); erasing
 * it a second time after it has already been unlinked must be avoided.
 */
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;

	rb_erase_init(n, &root->rb);
}

/*
Expand Down Expand Up @@ -467,8 +483,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
* requests waiting to be processed. It is sorted in the order that
* we will service the queues.
*/
static void cfq_service_tree_add(struct cfq_data *cfqd,
struct cfq_queue *cfqq, int add_front)
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
int add_front)
{
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
Expand Down Expand Up @@ -541,6 +557,63 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
}

/*
 * Search the priority tree for @ioprio for a queue whose next request
 * starts exactly at @sector.
 *
 * Returns the matching queue, or NULL if no queue's next_rq starts at
 * @sector.
 *
 * *@ret_parent is set to the last node visited: the matching node on a hit,
 * the would-be parent on a miss, or NULL if the tree is empty.
 *
 * If @rb_link is non-NULL, *@rb_link is set to the link slot the walk ended
 * on. That slot is empty (and therefore usable with rb_link_node()) only
 * when this function returns NULL.
 */
static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector,
		     struct rb_node **ret_parent, struct rb_node ***rb_link)
{
	struct rb_root *root = &cfqd->prio_trees[ioprio];
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > cfqq->next_rq->sector)
			n = &(*p)->rb_right;
		else if (sector < cfqq->next_rq->sector)
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		/* we descended past this node, so it is not a match */
		cfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
	return cfqq;
}

/*
 * (Re)position @cfqq in the priority tree for its ioprio, keyed by the
 * start sector of its next request.
 *
 * The queue is first removed from the tree if it is currently linked.
 * Idle-class queues and queues without a next request are left out of the
 * tree. If another queue already occupies the same sector, @cfqq is also
 * left out: linking it into that occupied slot would overwrite the existing
 * node's link and corrupt the tree. Its p_node stays cleared, so the
 * RB_EMPTY_NODE() checks on the removal paths remain correct.
 *
 * NOTE(review): the erase below uses the tree selected by the *current*
 * cfqq->ioprio. If ioprio can change while the queue is linked, the node
 * would be erased from the wrong root - confirm against the callers that
 * update ioprio.
 */
static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio];
	struct rb_node **p, *parent;
	struct cfq_queue *same_pos;

	if (!RB_EMPTY_NODE(&cfqq->p_node))
		rb_erase_init(&cfqq->p_node, root);

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

	same_pos = cfq_prio_tree_lookup(cfqd, cfqq->ioprio,
					cfqq->next_rq->sector, &parent, &p);
	if (same_pos)
		return;

	rb_link_node(&cfqq->p_node, parent, p);
	rb_insert_color(&cfqq->p_node, root);
}

/*
* Update cfqq's position in the service tree.
*/
Expand All @@ -549,8 +622,10 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* Resorting requires the cfqq to be on the RR list already.
*/
if (cfq_cfqq_on_rr(cfqq))
if (cfq_cfqq_on_rr(cfqq)) {
cfq_service_tree_add(cfqd, cfqq, 0);
cfq_prio_tree_add(cfqd, cfqq);
}
}

/*
Expand Down Expand Up @@ -581,6 +656,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

if (!RB_EMPTY_NODE(&cfqq->rb_node))
cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
if (!RB_EMPTY_NODE(&cfqq->p_node))
rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]);

BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
Expand Down Expand Up @@ -610,7 +687,7 @@ static void cfq_add_rq_rb(struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
struct cfq_data *cfqd = cfqq->cfqd;
struct request *__alias;
struct request *__alias, *prev;

cfqq->queued[rq_is_sync(rq)]++;

Expand All @@ -627,7 +704,15 @@ static void cfq_add_rq_rb(struct request *rq)
/*
* check if this request is a better next-serve candidate
*/
prev = cfqq->next_rq;
cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);

/*
* adjust priority tree position, if ->next_rq changes
*/
if (prev != cfqq->next_rq)
cfq_prio_tree_add(cfqd, cfqq);

BUG_ON(!cfqq->next_rq);
}

Expand Down Expand Up @@ -840,11 +925,15 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
/*
* Get and set a new active queue for service.
*/
/*
 * Make @cfqq the active queue, or pick one if the caller has no preference.
 *
 * @cfqq: queue to activate (e.g. a close cooperator chosen by the caller),
 *        or NULL to take the next queue from cfq_get_next_queue().
 *
 * A queue picked here through the normal path has its coop flag cleared;
 * a queue handed in by the caller is activated as-is and must not be
 * replaced by the normal pick.
 *
 * Returns the queue that was passed to __cfq_set_active_queue(), which may
 * be NULL if no queue was supplied and cfq_get_next_queue() found none.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	if (!cfqq) {
		cfqq = cfq_get_next_queue(cfqd);
		if (cfqq)
			cfq_clear_cfqq_coop(cfqq);
	}

	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}
Expand All @@ -868,17 +957,89 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
}

static int cfq_close_cooperator(struct cfq_data *cfq_data,
struct cfq_queue *cfqq)
static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
struct cfq_queue *cur_cfqq)
{
struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio];
struct rb_node *parent, *node;
struct cfq_queue *__cfqq;
sector_t sector = cfqd->last_position;

if (RB_EMPTY_ROOT(root))
return NULL;

/*
* First, if we find a request starting at the end of the last
* request, choose it.
*/
__cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio,
sector, &parent, NULL);
if (__cfqq)
return __cfqq;

/*
* If the exact sector wasn't found, the parent of the NULL leaf
* will contain the closest sector.
*/
__cfqq = rb_entry(parent, struct cfq_queue, p_node);
if (cfq_rq_close(cfqd, __cfqq->next_rq))
return __cfqq;

if (__cfqq->next_rq->sector < sector)
node = rb_next(&__cfqq->p_node);
else
node = rb_prev(&__cfqq->p_node);
if (!node)
return NULL;

__cfqq = rb_entry(node, struct cfq_queue, p_node);
if (cfq_rq_close(cfqd, __cfqq->next_rq))
return __cfqq;

return NULL;
}

/*
 * Find a queue that is issuing I/O close to where the disk head last was.
 *
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 *	      closely cooperating with itself.
 * probe - if non-zero, only report whether a cooperator exists; if zero,
 *	   the returned queue is also marked with the coop flag.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
 * assumption.
 *
 * Returns the cooperating queue, or NULL if there is no active
 * cfq_io_context, no close queue was found, or the candidate already has
 * its coop flag set.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq,
					      int probe)
{
	struct cfq_queue *cfqq;

	/*
	 * A valid cfq_io_context is necessary to compare requests against
	 * the seek_mean of the current cfqq.
	 */
	if (!cfqd->active_cic)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

	/* a queue that already has the coop flag set is not picked again */
	if (cfq_cfqq_coop(cfqq))
		return NULL;

	if (!probe)
		cfq_mark_cfqq_coop(cfqq);
	return cfqq;
}


#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
Expand Down Expand Up @@ -917,13 +1078,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
if (!cic || !atomic_read(&cic->ioc->nr_tasks))
return;

/*
* See if this prio level has a good candidate
*/
if (cfq_close_cooperator(cfqd, cfqq) &&
(sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
return;

cfq_mark_cfqq_wait_request(cfqq);

/*
Expand Down Expand Up @@ -1000,7 +1154,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
*/
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq;
struct cfq_queue *cfqq, *new_cfqq = NULL;

cfqq = cfqd->active_queue;
if (!cfqq)
Expand Down Expand Up @@ -1033,6 +1187,16 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
if (!RB_EMPTY_ROOT(&cfqq->sort_list))
goto keep_queue;

/*
* If another queue has a request waiting within our mean seek
* distance, let it run. The expire code will check for close
* cooperators and put the close queue at the front of the service
* tree.
*/
new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
if (new_cfqq)
goto expire;

/*
* No requests pending. If the active queue still has requests in
* flight or is idling for a new request, allow either of these
Expand All @@ -1047,7 +1211,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
cfqq = cfq_set_active_queue(cfqd);
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
return cfqq;
}
Expand Down Expand Up @@ -1508,6 +1672,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
}

RB_CLEAR_NODE(&cfqq->rb_node);
RB_CLEAR_NODE(&cfqq->p_node);
INIT_LIST_HEAD(&cfqq->fifo);

atomic_set(&cfqq->ref, 0);
Expand Down Expand Up @@ -2000,16 +2165,24 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
* or if we want to idle in case it has no pending requests.
*/
if (cfqd->active_queue == cfqq) {
const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

if (cfq_cfqq_slice_new(cfqq)) {
cfq_set_prio_slice(cfqd, cfqq);
cfq_clear_cfqq_slice_new(cfqq);
}
/*
* If there are no requests waiting in this queue, and
* there are other queues ready to issue requests, AND
* those other queues are issuing requests within our
* mean seek distance, give them a chance to run instead
* of idling.
*/
if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
cfq_slice_expired(cfqd, 1);
else if (sync && !rq_noidle(rq) &&
RB_EMPTY_ROOT(&cfqq->sort_list)) {
else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
sync && !rq_noidle(rq))
cfq_arm_slice_timer(cfqd);
}
}

if (!cfqd->rq_in_driver)
Expand Down

0 comments on commit a36e71f

Please sign in to comment.