Skip to content

Commit 6ec4e25

Browse files
Ard Biesheuvelctmarinas
Ard Biesheuvel
authored andcommittedAug 9, 2017
md/raid6: implement recovery using ARM NEON intrinsics
Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <[email protected]> Signed-off-by: Catalin Marinas <[email protected]>
1 parent 35129dd commit 6ec4e25

File tree

5 files changed

+234
-1
lines changed

5 files changed

+234
-1
lines changed
 

‎include/linux/raid/pq.h

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3;
121121
extern const struct raid6_recov_calls raid6_recov_avx2;
122122
extern const struct raid6_recov_calls raid6_recov_avx512;
123123
extern const struct raid6_recov_calls raid6_recov_s390xc;
124+
extern const struct raid6_recov_calls raid6_recov_neon;
124125

125126
extern const struct raid6_calls raid6_neonx1;
126127
extern const struct raid6_calls raid6_neonx2;

‎lib/raid6/Makefile

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
55

66
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
77
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
8-
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
8+
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
99
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
1010
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
1111

@@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
2626
ifeq ($(ARCH),arm)
2727
NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
2828
endif
29+
CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
2930
ifeq ($(ARCH),arm64)
31+
CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
3032
CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
3133
CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
3234
CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only

‎lib/raid6/algos.c

+3
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
112112
#endif
113113
#ifdef CONFIG_S390
114114
&raid6_recov_s390xc,
115+
#endif
116+
#if defined(CONFIG_KERNEL_MODE_NEON)
117+
&raid6_recov_neon,
115118
#endif
116119
&raid6_recov_intx1,
117120
NULL

‎lib/raid6/recov_neon.c

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Copyright (C) 2012 Intel Corporation
3+
* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
4+
*
5+
* This program is free software; you can redistribute it and/or
6+
* modify it under the terms of the GNU General Public License
7+
* as published by the Free Software Foundation; version 2
8+
* of the License.
9+
*/
10+
11+
#include <linux/raid/pq.h>
12+
13+
#ifdef __KERNEL__
14+
#include <asm/neon.h>
15+
#else
16+
#define kernel_neon_begin()
17+
#define kernel_neon_end()
18+
#define cpu_has_neon() (1)
19+
#endif
20+
21+
static int raid6_has_neon(void)
22+
{
23+
return cpu_has_neon();
24+
}
25+
26+
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
27+
uint8_t *dq, const uint8_t *pbmul,
28+
const uint8_t *qmul);
29+
30+
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
31+
const uint8_t *qmul);
32+
33+
static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
34+
int failb, void **ptrs)
35+
{
36+
u8 *p, *q, *dp, *dq;
37+
const u8 *pbmul; /* P multiplier table for B data */
38+
const u8 *qmul; /* Q multiplier table (for both) */
39+
40+
p = (u8 *)ptrs[disks - 2];
41+
q = (u8 *)ptrs[disks - 1];
42+
43+
/*
44+
* Compute syndrome with zero for the missing data pages
45+
* Use the dead data pages as temporary storage for
46+
* delta p and delta q
47+
*/
48+
dp = (u8 *)ptrs[faila];
49+
ptrs[faila] = (void *)raid6_empty_zero_page;
50+
ptrs[disks - 2] = dp;
51+
dq = (u8 *)ptrs[failb];
52+
ptrs[failb] = (void *)raid6_empty_zero_page;
53+
ptrs[disks - 1] = dq;
54+
55+
raid6_call.gen_syndrome(disks, bytes, ptrs);
56+
57+
/* Restore pointer table */
58+
ptrs[faila] = dp;
59+
ptrs[failb] = dq;
60+
ptrs[disks - 2] = p;
61+
ptrs[disks - 1] = q;
62+
63+
/* Now, pick the proper data tables */
64+
pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
65+
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
66+
raid6_gfexp[failb]]];
67+
68+
kernel_neon_begin();
69+
__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
70+
kernel_neon_end();
71+
}
72+
73+
static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
74+
void **ptrs)
75+
{
76+
u8 *p, *q, *dq;
77+
const u8 *qmul; /* Q multiplier table */
78+
79+
p = (u8 *)ptrs[disks - 2];
80+
q = (u8 *)ptrs[disks - 1];
81+
82+
/*
83+
* Compute syndrome with zero for the missing data page
84+
* Use the dead data page as temporary storage for delta q
85+
*/
86+
dq = (u8 *)ptrs[faila];
87+
ptrs[faila] = (void *)raid6_empty_zero_page;
88+
ptrs[disks - 1] = dq;
89+
90+
raid6_call.gen_syndrome(disks, bytes, ptrs);
91+
92+
/* Restore pointer table */
93+
ptrs[faila] = dq;
94+
ptrs[disks - 1] = q;
95+
96+
/* Now, pick the proper data tables */
97+
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
98+
99+
kernel_neon_begin();
100+
__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
101+
kernel_neon_end();
102+
}
103+
104+
const struct raid6_recov_calls raid6_recov_neon = {
105+
.data2 = raid6_2data_recov_neon,
106+
.datap = raid6_datap_recov_neon,
107+
.valid = raid6_has_neon,
108+
.name = "neon",
109+
.priority = 10,
110+
};

‎lib/raid6/recov_neon_inner.c

+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Copyright (C) 2012 Intel Corporation
3+
* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
4+
*
5+
* This program is free software; you can redistribute it and/or
6+
* modify it under the terms of the GNU General Public License
7+
* as published by the Free Software Foundation; version 2
8+
* of the License.
9+
*/
10+
11+
#include <arm_neon.h>
12+
13+
static const uint8x16_t x0f = {
14+
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
15+
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
16+
};
17+
18+
#ifdef CONFIG_ARM
19+
/*
20+
* AArch32 does not provide this intrinsic natively because it does not
21+
* implement the underlying instruction. AArch32 only provides a 64-bit
22+
* wide vtbl.8 instruction, so use that instead.
23+
*/
24+
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
25+
{
26+
union {
27+
uint8x16_t val;
28+
uint8x8x2_t pair;
29+
} __a = { a };
30+
31+
return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
32+
vtbl2_u8(__a.pair, vget_high_u8(b)));
33+
}
34+
#endif
35+
36+
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
37+
uint8_t *dq, const uint8_t *pbmul,
38+
const uint8_t *qmul)
39+
{
40+
uint8x16_t pm0 = vld1q_u8(pbmul);
41+
uint8x16_t pm1 = vld1q_u8(pbmul + 16);
42+
uint8x16_t qm0 = vld1q_u8(qmul);
43+
uint8x16_t qm1 = vld1q_u8(qmul + 16);
44+
45+
/*
46+
* while ( bytes-- ) {
47+
* uint8_t px, qx, db;
48+
*
49+
* px = *p ^ *dp;
50+
* qx = qmul[*q ^ *dq];
51+
* *dq++ = db = pbmul[px] ^ qx;
52+
* *dp++ = db ^ px;
53+
* p++; q++;
54+
* }
55+
*/
56+
57+
while (bytes) {
58+
uint8x16_t vx, vy, px, qx, db;
59+
60+
px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
61+
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
62+
63+
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
64+
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
65+
vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
66+
qx = veorq_u8(vx, vy);
67+
68+
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
69+
vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
70+
vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
71+
vx = veorq_u8(vx, vy);
72+
db = veorq_u8(vx, qx);
73+
74+
vst1q_u8(dq, db);
75+
vst1q_u8(dp, veorq_u8(db, px));
76+
77+
bytes -= 16;
78+
p += 16;
79+
q += 16;
80+
dp += 16;
81+
dq += 16;
82+
}
83+
}
84+
85+
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
86+
const uint8_t *qmul)
87+
{
88+
uint8x16_t qm0 = vld1q_u8(qmul);
89+
uint8x16_t qm1 = vld1q_u8(qmul + 16);
90+
91+
/*
92+
* while (bytes--) {
93+
* *p++ ^= *dq = qmul[*q ^ *dq];
94+
* q++; dq++;
95+
* }
96+
*/
97+
98+
while (bytes) {
99+
uint8x16_t vx, vy;
100+
101+
vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
102+
103+
vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
104+
vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
105+
vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
106+
vx = veorq_u8(vx, vy);
107+
vy = veorq_u8(vx, vld1q_u8(p));
108+
109+
vst1q_u8(dq, vx);
110+
vst1q_u8(p, vy);
111+
112+
bytes -= 16;
113+
p += 16;
114+
q += 16;
115+
dq += 16;
116+
}
117+
}

0 commit comments

Comments
 (0)
Please sign in to comment.