forked from alibaba/MNN
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
xiaying
committed
Jun 27, 2022
1 parent
043c586
commit 2d13d6a
Showing
23 changed files
with
13,557 additions
and
13,309 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
source/backend/cpu/arm/arm32/bf16/MNNReluWithSlopeChannelBF16.S
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// | ||
// MNNReluWithSlopeChannelBF16.S | ||
// MNN | ||
// | ||
// Created by MNN on 2022/06/23. | ||
// Copyright © 2018, Alibaba Group Holding Limited | ||
// | ||
|
||
#ifdef __arm__ | ||
#ifndef __aarch64__ | ||
#include "MNNAsmGlobal.h" | ||
|
||
.text | ||
.align 5 | ||
|
||
asm_function MNNReluWithSlopeChannelBF16 | ||
// void MNNReluWithSlopeChannelBF16(dst, src, slope, size_t sizeQuad, size_t depthQuad) | ||
// | ||
// ReLU with a per-channel slope on 16-bit elements. Each element is widened to | ||
// fp32 by a 16-bit left shift and narrowed back by keeping its upper 16 bits | ||
// (no rounding). For each of the depthQuad groups one 4-lane slope vector is | ||
// loaded, then sizeQuad units of 4 lanes are processed as | ||
//     out = (x <= 0) ? x * slope : x | ||
// src and dst advance contiguously, 8 bytes per unit; slope advances 8 bytes | ||
// per group. Returns immediately when sizeQuad or depthQuad is zero. | ||
|
||
//Auto Load: | ||
//r0:dst, r1:src, r2:slope, r3:sizeQuad | ||
|
||
//Load from sp | ||
//r4:depthQuad | ||
|
||
//Scratch: | ||
//r5: units left in the current group | ||
//q15: slope widened to fp32, q0-q3: data, q8-q9: x*slope, q12-q13: (x<=0) masks | ||
|
||
push {r4,r5, lr} | ||
// three registers were pushed, so the first stack argument sits at sp+12 | ||
ldr r4, [sp, #12] | ||
|
||
cmp r4, #0 | ||
beq PReluEnd | ||
cmp r3, #0 | ||
beq PReluEnd | ||
|
||
PReluZLoop: | ||
// one slope vector per depth group: 4 x 16-bit -> 4 x fp32 | ||
vld1.32 {d30}, [r2]! | ||
vshll.s16 q15, d30, #16 | ||
mov r5, r3 | ||
cmp r5, #3 | ||
ble PReluL1 | ||
|
||
PReluL4Loop: | ||
// 4 units per iteration: two 16-byte loads, two 16-byte stores | ||
vld1.32 {q1}, [r1]! | ||
vshll.s16 q0, d2, #16 | ||
vshll.s16 q1, d3, #16 | ||
|
||
vcle.f32 q12, q0, #0 | ||
vcle.f32 q13, q1, #0 | ||
|
||
vld1.32 {q3}, [r1]! | ||
vshll.s16 q2, d6, #16 | ||
vshll.s16 q3, d7, #16 | ||
|
||
vmul.f32 q8, q0, q15 | ||
vmul.f32 q9, q1, q15 | ||
// take x*slope only in the lanes where x <= 0 | ||
vbit.32 q0, q8, q12 | ||
vbit.32 q1, q9, q13 | ||
|
||
vmul.f32 q8, q2, q15 | ||
vmul.f32 q9, q3, q15 | ||
|
||
// narrow back to 16 bit: d0/d1 together form q0 | ||
vshrn.i32 d0, q0, #16 | ||
vshrn.i32 d1, q1, #16 | ||
|
||
vst1.32 {q0}, [r0]! | ||
|
||
vcle.f32 q12, q2, #0 | ||
vcle.f32 q13, q3, #0 | ||
vbit.32 q2, q8, q12 | ||
vbit.32 q3, q9, q13 | ||
vshrn.i32 d4, q2, #16 | ||
vshrn.i32 d5, q3, #16 | ||
|
||
vst1.32 {q2}, [r0]! | ||
sub r5, r5, #4 | ||
cmp r5, #4 | ||
bge PReluL4Loop | ||
|
||
PReluL1: | ||
cmp r5, #0 | ||
|
||
beq PReluL1End | ||
|
||
PReluL1Loop: | ||
// remaining units, one at a time (4 lanes = 8 bytes in, 8 bytes out) | ||
vld1.32 {d0}, [r1]! | ||
vshll.s16 q0, d0, #16 | ||
vcle.f32 q2, q0, #0 | ||
vmul.f32 q1, q0, q15 | ||
vbit.32 q0, q1, q2 | ||
vshrn.i32 d0, q0, #16 | ||
// Store only the 4 narrowed values held in d0. Storing q0 here would write | ||
// 16 bytes and advance dst by 16 for an 8-byte unit. | ||
vst1.32 {d0}, [r0]! | ||
subs r5, r5, #1 | ||
bne PReluL1Loop | ||
|
||
PReluL1End: | ||
|
||
subs r4, r4, #1 | ||
bne PReluZLoop | ||
|
||
PReluEnd: | ||
|
||
pop {r4, r5, pc} | ||
|
||
#endif | ||
#endif |
97 changes: 97 additions & 0 deletions
97
source/backend/cpu/arm/arm64/bf16/MNNReluWithSlopeChannelBF16.S
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
// | ||
// MNNReluWithSlopeChannelBF16.S | ||
// MNN | ||
// | ||
// Created by MNN on 2022/06/23. | ||
// Copyright © 2018, Alibaba Group Holding Limited | ||
// | ||
|
||
#ifdef __aarch64__ | ||
#include "MNNAsmGlobal.h" | ||
|
||
.text | ||
.align 5 | ||
|
||
asm_function MNNReluWithSlopeChannelBF16 | ||
// void MNNReluWithSlopeChannelBF16(dst, src, slope, size_t sizeQuad, size_t depthQuad) | ||
// | ||
// ReLU with a per-channel slope on 16-bit elements. Each element is widened to | ||
// fp32 with shll #16 and narrowed back with shrn #16 (upper 16 bits, no rounding). | ||
// Per depth group: load one 4-lane slope vector, then for each of the sizeQuad | ||
// 4-lane units compute out = (x <= 0) ? x * slope : x. | ||
// Nothing is done when sizeQuad or depthQuad is zero. | ||
|
||
// Arguments: | ||
// x0: dst, x1: src, x2: slope, x3: sizeQuad, x4: depthQuad | ||
// Scratch: | ||
// x5: units left in the current depth group | ||
// v23: slope as fp32, v0-v3: data, v16-v17: x*slope, v20-v21: (x<=0) masks | ||
|
||
cbz x4, SlopeBF16Done | ||
cbz x3, SlopeBF16Done | ||
|
||
SlopeBF16DepthLoop: | ||
// fetch 4 slopes (8 bytes) and widen them to fp32 | ||
ld1 {v23.4h}, [x2], #8 | ||
shll v23.4s, v23.4h, #16 | ||
mov x5, x3 | ||
cmp x5, #4 | ||
b.lt SlopeBF16Tail | ||
|
||
SlopeBF16Unit4Loop: | ||
// read 4 units (32 bytes) up front, then widen all of them | ||
ld1 {v0.4h, v1.4h}, [x1], #16 | ||
ld1 {v2.4h, v3.4h}, [x1], #16 | ||
shll v0.4s, v0.4h, #16 | ||
shll v1.4s, v1.4h, #16 | ||
shll v2.4s, v2.4h, #16 | ||
shll v3.4s, v3.4h, #16 | ||
|
||
// units 0 and 1 | ||
fmul v16.4s, v0.4s, v23.4s | ||
fmul v17.4s, v1.4s, v23.4s | ||
fcmle v20.4s, v0.4s, #0 | ||
fcmle v21.4s, v1.4s, #0 | ||
bit v0.16b, v16.16b, v20.16b | ||
bit v1.16b, v17.16b, v21.16b | ||
shrn v0.4h, v0.4s, #16 | ||
shrn v1.4h, v1.4s, #16 | ||
st1 {v0.4h, v1.4h}, [x0], #16 | ||
|
||
// units 2 and 3 | ||
fmul v16.4s, v2.4s, v23.4s | ||
fmul v17.4s, v3.4s, v23.4s | ||
fcmle v20.4s, v2.4s, #0 | ||
fcmle v21.4s, v3.4s, #0 | ||
bit v2.16b, v16.16b, v20.16b | ||
bit v3.16b, v17.16b, v21.16b | ||
shrn v2.4h, v2.4s, #16 | ||
shrn v3.4h, v3.4s, #16 | ||
st1 {v2.4h, v3.4h}, [x0], #16 | ||
|
||
sub x5, x5, #4 | ||
cmp x5, #4 | ||
b.ge SlopeBF16Unit4Loop | ||
|
||
SlopeBF16Tail: | ||
cbz x5, SlopeBF16DepthNext | ||
|
||
SlopeBF16TailLoop: | ||
// leftover units, one 4-lane unit (8 bytes) per pass | ||
ld1 {v0.4h}, [x1], #8 | ||
shll v0.4s, v0.4h, #16 | ||
fmul v1.4s, v0.4s, v23.4s | ||
fcmle v2.4s, v0.4s, #0 | ||
bit v0.16b, v1.16b, v2.16b | ||
shrn v0.4h, v0.4s, #16 | ||
st1 {v0.4h}, [x0], #8 | ||
subs x5, x5, #1 | ||
b.ne SlopeBF16TailLoop | ||
|
||
SlopeBF16DepthNext: | ||
subs x4, x4, #1 | ||
b.ne SlopeBF16DepthLoop | ||
|
||
SlopeBF16Done: | ||
ret | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.