md/raid5: activate raid6 rmw feature
Glue it all together. The raid6 rmw path should work the same way as the
already existing raid5 logic, so emulate the prexor handling/flags and
split functions as needed.

1) Enable xor_syndrome() in the async layer.

2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. XOR the syndrome
at the start of an rmw run, as was already done for the single parity
case (a sketch of the underlying parity update follows this list).

3) Take care of the rmw run in ops_run_reconstruct6(). Again, process only
the changed pages to get the syndrome back into sync.

4) Enhance set_syndrome_sources() to leave the pages of unchanged blocks
NULL when we are in an rmw run. The lower layers calculate the start & stop
pages from that and call xor_syndrome() accordingly.

5) Adapt the several places that ignored Q handling up to now.
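
The mechanism behind points 2) and 3) is the standard RAID6 parity-update
identity over GF(2^8): for a rewritten block at data-disk index idx,

  P' = P ^ Dold ^ Dnew
  Q' = Q ^ g^idx * (Dold ^ Dnew)

A minimal byte-at-a-time C sketch of that identity follows. It is an
illustration only: gf2_x() and rmw_update() are names made up for this
sketch, while the kernel uses the vectorized implementations in lib/raid6.

#include <stddef.h>
#include <stdint.h>

/* Multiply by the RAID6 generator g = 2 in GF(2^8), polynomial 0x11d. */
static uint8_t gf2_x(uint8_t v)
{
	return (uint8_t)(v << 1) ^ ((v & 0x80) ? 0x1d : 0);
}

/* XOR-update P/Q in place for one rewritten block at data-disk index idx. */
static void rmw_update(uint8_t *p, uint8_t *q, const uint8_t *dold,
		       const uint8_t *dnew, int idx, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t delta = dold[i] ^ dnew[i];
		uint8_t qdelta = delta;
		int k;

		/* scale the data delta by g^idx for the Q syndrome */
		for (k = 0; k < idx; k++)
			qdelta = gf2_x(qdelta);
		p[i] ^= delta;		/* P' = P ^ Dold ^ Dnew */
		q[i] ^= qdelta;		/* Q' = Q ^ g^idx * delta */
	}
}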

Performance numbers for a single E5630 system with a mix of 10 7200 rpm
desktop/server disks. 300 seconds of random writes with 8 threads onto a
3.2 TB (10*400 GB) RAID6 with 64K chunk and no spare (group_thread_cnt=4):

bsize   rmw_level=1   rmw_level=0   rmw_level=1   rmw_level=0
        skip_copy=1   skip_copy=1   skip_copy=0   skip_copy=0
   4K      115 KB/s      141 KB/s      165 KB/s      140 KB/s
   8K      225 KB/s      275 KB/s      324 KB/s      274 KB/s
  16K      434 KB/s      536 KB/s      640 KB/s      534 KB/s
  32K      751 KB/s    1,051 KB/s    1,234 KB/s    1,045 KB/s
  64K    1,339 KB/s    1,958 KB/s    2,282 KB/s    1,962 KB/s
 128K    2,673 KB/s    3,862 KB/s    4,113 KB/s    3,898 KB/s
 256K    7,685 KB/s    7,539 KB/s    7,557 KB/s    7,638 KB/s
 512K   19,556 KB/s   19,558 KB/s   19,652 KB/s   19,688 KB/s
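
As background for the diffs below, here is a byte-wise sketch of the
xor_syndrome() contract: fold the partial P/Q of the data disks in the
window [start, stop] into the existing parity pages instead of
regenerating the full syndrome. Again an illustration only, under the same
assumptions as the sketch above (xor_syndrome_sketch() is a made-up name;
the kernel's real implementations live in lib/raid6 and are vectorized):

#include <stddef.h>
#include <stdint.h>

/* Multiply by the RAID6 generator g = 2 in GF(2^8), polynomial 0x11d. */
static uint8_t gf2_x(uint8_t v)
{
	return (uint8_t)(v << 1) ^ ((v & 0x80) ? 0x1d : 0);
}

/*
 * Fold the partial syndrome of data disks [start, stop] into the existing
 * P/Q pages. As in do_sync_gen_syndrome(), ptrs[] holds the data pages
 * first, then P, then Q.
 */
static void xor_syndrome_sketch(int disks, int start, int stop,
				size_t bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p = dptr[disks - 2];	/* XOR parity */
	uint8_t *q = dptr[disks - 1];	/* RS syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dptr[stop][i];
		uint8_t wq = dptr[stop][i];
		int z;

		/* Horner's scheme over the changed window */
		for (z = stop - 1; z >= start; z--) {
			wp ^= dptr[z][i];
			wq = gf2_x(wq) ^ dptr[z][i];
		}
		/* rebase the Q weights so disk z carries g^z */
		for (z = start - 1; z >= 0; z--)
			wq = gf2_x(wq);
		p[i] ^= wp;
		q[i] ^= wq;
	}
}

Called once with the old window contents during prexor and once with the
new contents after biodrain, the two folds cancel the old data out of P/Q
and fold the new data in, which is exactly the rmw identity shown above.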

Signed-off-by: Markus Stockhausen <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
Markus Stockhausen authored and neilbrown committed Apr 21, 2015
1 parent a582564 commit 584acdd
Showing 4 changed files with 115 additions and 30 deletions.
crypto/async_tx/async_pq.c (16 additions & 3 deletions)
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 {
	void **srcs;
	int i;
+	int start = -1, stop = disks - 3;
 
	if (submit->scribble)
		srcs = submit->scribble;
@@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
		if (blocks[i] == NULL) {
			BUG_ON(i > disks - 3); /* P or Q can't be zero */
			srcs[i] = (void*)raid6_empty_zero_page;
-		} else
+		} else {
			srcs[i] = page_address(blocks[i]) + offset;
+			if (i < disks - 2) {
+				stop = i;
+				if (start == -1)
+					start = i;
+			}
+		}
	}
-	raid6_call.gen_syndrome(disks, len, srcs);
+	if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
+		BUG_ON(!raid6_call.xor_syndrome);
+		if (start >= 0)
+			raid6_call.xor_syndrome(disks, start, stop, len, srcs);
+	} else
+		raid6_call.gen_syndrome(disks, len, srcs);
	async_tx_sync_epilog(submit);
 }

@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
	if (device)
		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
 
-	if (unmap &&
+	/* XORing P/Q is only implemented in software */
+	if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
	    (src_cnt <= dma_maxpq(device, 0) ||
	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
	    is_dma_pq_aligned(device, offset, 0, len)) {
drivers/md/raid5.c (78 additions & 26 deletions)
Expand Up @@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
* destination buffer is recorded in srcs[count] and the Q destination
* is recorded in srcs[count+1]].
*/
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
static int set_syndrome_sources(struct page **srcs,
struct stripe_head *sh,
int srctype)
{
int disks = sh->disks;
int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs,
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+		struct r5dev *dev = &sh->dev[i];
 
-		srcs[slot] = sh->dev[i].page;
+		if (i == sh->qd_idx || i == sh->pd_idx ||
+		    (srctype == SYNDROME_SRC_ALL) ||
+		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
+		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		    (srctype == SYNDROME_SRC_WRITTEN &&
+		     dev->written))
+			srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
	atomic_inc(&sh->count);
 
	if (target == qd_idx) {
-		count = set_syndrome_sources(blocks, sh);
+		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
			       &submit);
 
-		count = set_syndrome_sources(blocks, sh);
+		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		init_async_submit(&submit, ASYNC_TX_FENCE, tx,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
@@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-	       struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
 {
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1544,6 +1553,26 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
	return tx;
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
+{
+	struct page **blocks = to_addr_page(percpu, 0);
+	int count;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+	return tx;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
+	int synflags;
+	unsigned long txflags;
 
	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
	blocks = to_addr_page(percpu, j);
-	count = set_syndrome_sources(blocks, sh);
+
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		synflags = SYNDROME_SRC_WRITTEN;
+		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+	} else {
+		synflags = SYNDROME_SRC_ALL;
+		txflags = ASYNC_TX_ACK;
+	}
+
+	count = set_syndrome_sources(blocks, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;
 
	if (last_stripe) {
		atomic_inc(&head_sh->count);
-		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
		(unsigned long long)sh->sector, checkp);
 
	BUG_ON(sh->batch_head);
-	count = set_syndrome_sources(srcs, sh);
+	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
	if (!checkp)
		srcs[count] = NULL;

@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
			async_tx_ack(tx);
	}
 
-	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, percpu, tx);
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+		if (level < 6)
+			tx = ops_run_prexor5(sh, percpu, tx);
+		else
+			tx = ops_run_prexor6(sh, percpu, tx);
+	}
 
	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
 {
-	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;

@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
		if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
			atomic_inc(&conf->pending_full_writes);
	} else {
-		BUG_ON(level == 6);
		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+		BUG_ON(level == 6 &&
+			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
-			if (i == pd_idx)
+			if (i == pd_idx || i == qd_idx)
				continue;
 
			if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
	int rmw = 0, rcw = 0, i;
	sector_t recovery_cp = conf->mddev->recovery_cp;
 
-	/* RAID6 requires 'rcw' in current implementation.
-	 * Otherwise, check whether resync is now happening or should start.
+	/* Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
-	if (conf->max_degraded == 2 ||
+	if (conf->rmw_level == PARITY_DISABLE_RMW ||
	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
	     s->failed == 0)) {
		/* Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
		rcw = 1; rmw = 2;
-		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-			 conf->max_degraded, (unsigned long long)recovery_cp,
+		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+			 conf->rmw_level, (unsigned long long)recovery_cp,
			 (unsigned long long)sh->sector);
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
				rmw += 2*disks; /* cannot read it */
		}
		/* Would I have to read this buffer for reconstruct_write */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+		    i != sh->pd_idx && i != sh->qd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,15 +3566,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		 (unsigned long long)sh->sector, rmw, rcw);
	set_bit(STRIPE_HANDLE, &sh->state);
-	if (rmw < rcw && rmw > 0) {
+	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
		if (conf->mddev->queue)
			blk_add_trace_msg(conf->mddev->queue,
					  "raid5 rmw %llu %d",
					  (unsigned long long)sh->sector, rmw);
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
+			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
			     test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
			}
		}
	}
-	if (rcw <= rmw && rcw > 0) {
+	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread =0;
		rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
	}
 
	conf->level = mddev->new_level;
-	if (conf->level == 6)
+	if (conf->level == 6) {
		conf->max_degraded = 2;
-	else
+		if (raid6_call.xor_syndrome)
+			conf->rmw_level = PARITY_ENABLE_RMW;
+		else
+			conf->rmw_level = PARITY_DISABLE_RMW;
+	} else {
		conf->max_degraded = 1;
+		conf->rmw_level = PARITY_ENABLE_RMW;
+	}
	conf->algorithm = mddev->new_layout;
	conf->reshape_progress = mddev->reshape_position;
	if (conf->reshape_progress != MaxSector) {
drivers/md/raid5.h (18 additions & 1 deletion)
@@ -355,6 +355,23 @@ enum {
	STRIPE_OP_RECONSTRUCT,
	STRIPE_OP_CHECK,
 };
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+	PARITY_DISABLE_RMW = 0,
+	PARITY_ENABLE_RMW,
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+	SYNDROME_SRC_ALL,
+	SYNDROME_SRC_WANT_DRAIN,
+	SYNDROME_SRC_WRITTEN,
+};
 /*
  * Plugging:
  *
@@ -411,7 +428,7 @@ struct r5conf {
	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
	struct mddev		*mddev;
	int			chunk_sectors;
-	int			level, algorithm;
+	int			level, algorithm, rmw_level;
	int			max_degraded;
	int			raid_disks;
	int			max_nr_stripes;
include/linux/async_tx.h (3 additions & 0 deletions)
@@ -60,12 +60,15 @@ struct dma_chan_ref {
  * dependency chain
  * @ASYNC_TX_FENCE: specify that the next operation in the dependency
  * chain uses this operation's result as an input
+ * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
+ * input data. Required for rmw case.
  */
 enum async_tx_flags {
	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
	ASYNC_TX_ACK		 = (1 << 2),
	ASYNC_TX_FENCE		 = (1 << 3),
+	ASYNC_TX_PQ_XOR_DST	 = (1 << 4),
 };
 
 /**
