forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
md/raid6: implement recovery using ARM NEON intrinsics
Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <[email protected]> Signed-off-by: Catalin Marinas <[email protected]>
- Loading branch information
Showing
5 changed files
with
234 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
/* | ||
* Copyright (C) 2012 Intel Corporation | ||
* Copyright (C) 2017 Linaro Ltd. <[email protected]> | ||
* | ||
* This program is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU General Public License | ||
* as published by the Free Software Foundation; version 2 | ||
* of the License. | ||
*/ | ||
|
||
#include <linux/raid/pq.h> | ||
|
||
/*
 * When built without __KERNEL__ there is no <asm/neon.h>: the NEON
 * begin/end bracketing expands to nothing and NEON is reported as
 * always present.
 */
#ifndef __KERNEL__
#define kernel_neon_begin()
#define kernel_neon_end()
#define cpu_has_neon()		(1)
#else
#include <asm/neon.h>
#endif
|
||
/*
 * .valid hook for raid6_recov_neon: returns whatever cpu_has_neon()
 * reports (non-zero when NEON may be used).
 */
static int raid6_has_neon(void)
{
	const int neon_available = cpu_has_neon();

	return neon_available;
}
|
||
/*
 * NEON inner loop for two-data-block recovery.
 * Must be called between kernel_neon_begin() and kernel_neon_end().
 */
void __raid6_2data_recov_neon(int bytes,
			      uint8_t *p, uint8_t *q,
			      uint8_t *dp, uint8_t *dq,
			      const uint8_t *pbmul, const uint8_t *qmul);
|
||
/*
 * NEON inner loop for data + P recovery.
 * Must be called between kernel_neon_begin() and kernel_neon_end().
 */
void __raid6_datap_recov_neon(int bytes,
			      uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul);
|
||
static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, | ||
int failb, void **ptrs) | ||
{ | ||
u8 *p, *q, *dp, *dq; | ||
const u8 *pbmul; /* P multiplier table for B data */ | ||
const u8 *qmul; /* Q multiplier table (for both) */ | ||
|
||
p = (u8 *)ptrs[disks - 2]; | ||
q = (u8 *)ptrs[disks - 1]; | ||
|
||
/* | ||
* Compute syndrome with zero for the missing data pages | ||
* Use the dead data pages as temporary storage for | ||
* delta p and delta q | ||
*/ | ||
dp = (u8 *)ptrs[faila]; | ||
ptrs[faila] = (void *)raid6_empty_zero_page; | ||
ptrs[disks - 2] = dp; | ||
dq = (u8 *)ptrs[failb]; | ||
ptrs[failb] = (void *)raid6_empty_zero_page; | ||
ptrs[disks - 1] = dq; | ||
|
||
raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
|
||
/* Restore pointer table */ | ||
ptrs[faila] = dp; | ||
ptrs[failb] = dq; | ||
ptrs[disks - 2] = p; | ||
ptrs[disks - 1] = q; | ||
|
||
/* Now, pick the proper data tables */ | ||
pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; | ||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ | ||
raid6_gfexp[failb]]]; | ||
|
||
kernel_neon_begin(); | ||
__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, | ||
void **ptrs) | ||
{ | ||
u8 *p, *q, *dq; | ||
const u8 *qmul; /* Q multiplier table */ | ||
|
||
p = (u8 *)ptrs[disks - 2]; | ||
q = (u8 *)ptrs[disks - 1]; | ||
|
||
/* | ||
* Compute syndrome with zero for the missing data page | ||
* Use the dead data page as temporary storage for delta q | ||
*/ | ||
dq = (u8 *)ptrs[faila]; | ||
ptrs[faila] = (void *)raid6_empty_zero_page; | ||
ptrs[disks - 1] = dq; | ||
|
||
raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
|
||
/* Restore pointer table */ | ||
ptrs[faila] = dq; | ||
ptrs[disks - 1] = q; | ||
|
||
/* Now, pick the proper data tables */ | ||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; | ||
|
||
kernel_neon_begin(); | ||
__raid6_datap_recov_neon(bytes, p, q, dq, qmul); | ||
kernel_neon_end(); | ||
} | ||
|
||
const struct raid6_recov_calls raid6_recov_neon = { | ||
.data2 = raid6_2data_recov_neon, | ||
.datap = raid6_datap_recov_neon, | ||
.valid = raid6_has_neon, | ||
.name = "neon", | ||
.priority = 10, | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/* | ||
* Copyright (C) 2012 Intel Corporation | ||
* Copyright (C) 2017 Linaro Ltd. <[email protected]> | ||
* | ||
* This program is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU General Public License | ||
* as published by the Free Software Foundation; version 2 | ||
* of the License. | ||
*/ | ||
|
||
#include <arm_neon.h> | ||
|
||
/* 0x0f in every byte lane: masks a vector down to its low nibbles. */
static const uint8x16_t x0f = {
	0x0f, 0x0f, 0x0f, 0x0f,
	0x0f, 0x0f, 0x0f, 0x0f,
	0x0f, 0x0f, 0x0f, 0x0f,
	0x0f, 0x0f, 0x0f, 0x0f,
};
|
||
#ifdef CONFIG_ARM
/*
 * AArch32 lacks the instruction behind vqtbl1q_u8(), so the intrinsic is
 * not provided there. Emulate it with the 64-bit wide vtbl.8 lookup:
 * view the 16-byte table @a as a pair of 8-byte halves and look up the
 * low and high halves of the index vector @b separately.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} table = { a };
	const uint8x8_t lo = vtbl2_u8(table.pair, vget_low_u8(b));
	const uint8x8_t hi = vtbl2_u8(table.pair, vget_high_u8(b));

	return vcombine_u8(lo, hi);
}
#endif
|
||
/*
 * NEON core of the two-data-block recovery, 16 bytes per iteration.
 *
 * @bytes: number of bytes to process
 *         NOTE(review): the loop only stops when the count reaches
 *         exactly zero, so this must be a multiple of 16 - callers
 *         presumably pass whole pages; confirm.
 * @p, @q:   P and Q blocks (read only here)
 * @dp, @dq: delta p / delta q on entry, overwritten with the results
 * @pbmul, @qmul: 32-byte multiplier tables, read as two 16-byte halves
 *
 * Scalar equivalent:
 *
 *	while ( bytes-- ) {
 *		uint8_t px, qx, db;
 *
 *		px    = *p ^ *dp;
 *		qx    = qmul[*q ^ *dq];
 *		*dq++ = db = pbmul[px] ^ qx;
 *		*dp++ = db ^ px;
 *		p++; q++;
 *	}
 */
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			      uint8_t *dq, const uint8_t *pbmul,
			      const uint8_t *qmul)
{
	const uint8x16_t pb_lo = vld1q_u8(pbmul);
	const uint8x16_t pb_hi = vld1q_u8(pbmul + 16);
	const uint8x16_t qm_lo = vld1q_u8(qmul);
	const uint8x16_t qm_hi = vld1q_u8(qmul + 16);

	for (; bytes; bytes -= 16, p += 16, q += 16, dp += 16, dq += 16) {
		uint8x16_t px, qd, lo, hi, qx, db;

		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
		qd = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/*
		 * Table lookup by nibble: low nibble selects from the first
		 * table half, high nibble from the second. The 16-bit shift
		 * drags bits across byte lanes; the 0x0f mask drops them.
		 */
		hi = (uint8x16_t)vshrq_n_s16((int16x8_t)qd, 4);
		lo = vqtbl1q_u8(qm_lo, vandq_u8(qd, x0f));
		hi = vqtbl1q_u8(qm_hi, vandq_u8(hi, x0f));
		qx = veorq_u8(lo, hi);

		/* Same lookup for px against the pbmul table */
		hi = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
		lo = vqtbl1q_u8(pb_lo, vandq_u8(px, x0f));
		hi = vqtbl1q_u8(pb_hi, vandq_u8(hi, x0f));
		lo = veorq_u8(lo, hi);
		db = veorq_u8(lo, qx);

		vst1q_u8(dq, db);
		vst1q_u8(dp, veorq_u8(db, px));
	}
}
|
||
/*
 * NEON core of the data + P recovery, 16 bytes per iteration.
 *
 * @bytes: number of bytes to process
 *         NOTE(review): the loop only stops when the count reaches
 *         exactly zero, so this must be a multiple of 16 - callers
 *         presumably pass whole pages; confirm.
 * @p:    P block, updated in place
 * @q:    Q block (read only here)
 * @dq:   delta q on entry, overwritten with the result
 * @qmul: 32-byte multiplier table, read as two 16-byte halves
 *
 * Scalar equivalent:
 *
 *	while (bytes--) {
 *		*p++ ^= *dq = qmul[*q ^ *dq];
 *		q++; dq++;
 *	}
 */
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul)
{
	const uint8x16_t qm_lo = vld1q_u8(qmul);
	const uint8x16_t qm_hi = vld1q_u8(qmul + 16);

	for (; bytes; bytes -= 16, p += 16, q += 16, dq += 16) {
		uint8x16_t qd, lo, hi, data, new_p;

		qd = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/*
		 * Table lookup by nibble: low nibble selects from the first
		 * table half, high nibble from the second. The 16-bit shift
		 * drags bits across byte lanes; the 0x0f mask drops them.
		 */
		hi = (uint8x16_t)vshrq_n_s16((int16x8_t)qd, 4);
		lo = vqtbl1q_u8(qm_lo, vandq_u8(qd, x0f));
		hi = vqtbl1q_u8(qm_hi, vandq_u8(hi, x0f));
		data = veorq_u8(lo, hi);

		/* P is read before either store is issued */
		new_p = veorq_u8(data, vld1q_u8(p));

		vst1q_u8(dq, data);
		vst1q_u8(p, new_p);
	}
}