
Commit

No commit message
xiaying committed Jun 27, 2022
1 parent 043c586 commit 2d13d6a
Showing 23 changed files with 13,557 additions and 13,309 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -110,7 +110,7 @@ Group #1 (Full): 23329087

Group #2 (Full): 23350225

-Group #3: https://h5.dingtalk.com/circle/healthCheckin.html?dtaction=os&corpId=ding2c1d5c85a81030b9a483726330e8af54&574b2bb2-c53a-4=497bad6b-25a5-4&cbdbhh=qwertyuiop
+Group #3: https://h5.dingtalk.com/circle/healthCheckin.html?dtaction=os&corpId=ding8989a1d6ae6ef130b177420cc0e366ea&f0c81=1b93a&cbdbhh=qwertyuiop

## License
Apache 2.0
README_CN.md (8 changes: 8 additions & 0 deletions)
@@ -105,6 +105,14 @@ The input to MNN (an AI inference model) is a directed acyclic graph (DAG), in which each
- MNN-CV: an image-processing algorithm library similar to OpenCV, with the core compute implemented on MNN
- MNN-Train: MNN's training module, supporting training on every platform

+## Community and Feedback
+DingTalk groups:

+- DingTalk group 1: 23329087
+- DingTalk group 2: 23350225
+- DingTalk group 3: https://h5.dingtalk.com/circle/healthCheckin.html?dtaction=os&corpId=ding8989a1d6ae6ef130b177420cc0e366ea&f0c81=1b93a&cbdbhh=qwertyuiop

## License
Apache 2.0
express/module/Module.cpp (19 changes: 11 additions & 8 deletions)
@@ -297,6 +297,9 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
#endif // MNN_INTERNAL_ENABLED

std::shared_ptr<Module::Info> info(new Module::Info);
+if (net->extraInfo() && net->extraInfo()->version()) {
+    info->version = net->extraInfo()->version()->str();
+}
auto rtMgr = _rtMgr;
Module::Config defaultConfig;
if (nullptr == config) {
@@ -308,14 +311,14 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
sche_config.backendConfig = config->backend->config;
rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
}
+info->inputNames = inputs;
+info->outputNames = outputs;
if ((!inputs.empty()) && (!outputs.empty())) {
_loadInputs(info.get(), inputs, net);
info->runTimeManager = rtMgr;
std::shared_ptr<Module> m(PipelineModule::load(inputs, outputs, buffer, length, rtMgr, config));
return new NetModule(m, info);
}
-std::vector<std::string> newInputs = inputs;
-std::vector<std::string> newOutputs = outputs;
std::set<int> inputIdx, outputIdx, realInput, realOutput;
for (int i=0; i< net->oplists()->size(); ++i) {
auto op = net->oplists()->GetAs<Op>(i);
@@ -338,18 +341,18 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
}
}
std::set_difference(outputIdx.begin(), outputIdx.end(), inputIdx.begin(), inputIdx.end(), std::inserter(realOutput, realOutput.begin()));
-if (newInputs.empty()) {
+if (info->inputNames.empty()) {
for (auto index : realInput) {
-newInputs.emplace_back(net->tensorName()->GetAsString(index)->str());
+info->inputNames.emplace_back(net->tensorName()->GetAsString(index)->str());
}
}
-if (newOutputs.empty()) {
+if (info->outputNames.empty()) {
for (auto index : realOutput) {
-newOutputs.emplace_back(net->tensorName()->GetAsString(index)->str());
+info->outputNames.emplace_back(net->tensorName()->GetAsString(index)->str());
}
}
-std::shared_ptr<Module> m(PipelineModule::load(newInputs, newOutputs, buffer, length, rtMgr, config));
-_loadInputs(info.get(), newInputs, net);
+std::shared_ptr<Module> m(PipelineModule::load(info->inputNames, info->outputNames, buffer, length, rtMgr, config));
+_loadInputs(info.get(), info->inputNames, net);
info->runTimeManager = rtMgr;
return new NetModule(m, info);
}
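When the caller supplies no input or output names, the loader above derives them from the graph itself: a tensor index that some op produces but no op ever consumes is a real graph output, which is exactly what the `std::set_difference` call computes. A minimal standalone sketch of that set arithmetic (the toy indices are illustrative, not from a real model):

```cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>

int main() {
    // Toy tensor indices: which tensors any op reads, and which any op writes.
    std::set<int> inputIdx  = {0, 1, 2};     // consumed by some op
    std::set<int> outputIdx = {1, 2, 3, 4};  // produced by some op
    std::set<int> realOutput;
    // Produced but never consumed => a real output of the whole graph.
    std::set_difference(outputIdx.begin(), outputIdx.end(),
                        inputIdx.begin(), inputIdx.end(),
                        std::inserter(realOutput, realOutput.begin()));
    for (int idx : realOutput) {
        std::cout << idx << "\n";  // prints 3 and 4
    }
    return 0;
}
```

The detected names now land directly in `info->inputNames` / `info->outputNames`, so the same `Info` object records the effective I/O order whether or not the caller supplied names; that is the point of replacing the local `newInputs`/`newOutputs` vectors.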
include/MNN/expr/Module.hpp (9 changes: 8 additions & 1 deletion)
@@ -81,10 +81,17 @@ class MNN_PUBLIC Module {

struct Info {
// Input info loaded from the model
+// If the i-th input has no info, it will be nullptr
std::vector<Variable::Info> inputs;
// The Module's defaultFormat, NCHW or NHWC
Dimensionformat defaultFormat;
// Runtime Info
std::shared_ptr<MNN::Express::Executor::RuntimeManager> runTimeManager;
+// Input names, in order
+std::vector<std::string> inputNames;
+// Output names, in order
+std::vector<std::string> outputNames;
+// The version of MNNConvert that built the module
+std::string version;
};
const Info* getInfo() const;
class CloneContext {
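A sketch of how calling code might read the new fields, assuming the usual `Module::load(inputs, outputs, fileName)` entry point and a converted model file named `model.mnn`; passing empty name lists exercises the auto-detection path in Module.cpp above:

```cpp
#include <MNN/expr/Module.hpp>
#include <iostream>
#include <memory>

int main() {
    using MNN::Express::Module;
    // Empty name lists: let the loader derive the real inputs/outputs.
    std::unique_ptr<Module> net(Module::load({}, {}, "model.mnn"));
    if (nullptr == net) {
        return 1;
    }
    const Module::Info* info = net->getInfo();
    std::cout << "converter version: " << info->version << "\n";
    for (const auto& name : info->inputNames) {
        std::cout << "input:  " << name << "\n";
    }
    for (const auto& name : info->outputNames) {
        std::cout << "output: " << name << "\n";
    }
    return 0;
}
```

Note that `version` stays empty for models whose converter did not write `extraInfo`, since the loader only fills it when `net->extraInfo()->version()` is present.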
project/android/build_32.sh (1 change: 0 additions & 1 deletion)
@@ -9,7 +9,6 @@ cmake ../../../ \
-DANDROID_TOOLCHAIN=clang \
-DMNN_USE_LOGCAT=false \
-DMNN_USE_SSE=OFF \
--DMNN_SUPPORT_BF16=OFF \
-DMNN_BUILD_TEST=ON \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3
source/backend/cpu/arm/arm32/bf16/MNNReluWithSlopeChannelBF16.S (105 changes: 105 additions & 0 deletions)
@@ -0,0 +1,105 @@
//
// MNNReluWithSlopeChannelBF16.S
// MNN
//
// Created by MNN on 2022/06/23.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNReluWithSlopeChannelBF16
//void MNNReluWithSlopeChannelBF16(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad)

//Auto Load:
//r0:dst, r1:src, r2:slope, r3:sizeQuad

//Load from sp
//r4:depthQuad

push {r4,r5, lr}
ldr r4, [sp, #12]

cmp r4, #0
beq PReluEnd
cmp r3, #0
beq PReluEnd


PReluZLoop:
vld1.32 {d30}, [r2]!
vshll.s16 q15, d30, #16
mov r5, r3
cmp r5, #3
ble PReluL1

PReluL4Loop:
vld1.32 {q1}, [r1]!
vshll.s16 q0, d2, #16
vshll.s16 q1, d3, #16

vcle.f32 q12, q0, #0
vcle.f32 q13, q1, #0

vld1.32 {q3}, [r1]!
vshll.s16 q2, d6, #16
vshll.s16 q3, d7, #16

vmul.f32 q8, q0, q15
vmul.f32 q9, q1, q15
vbit.32 q0, q8, q12
vbit.32 q1, q9, q13

vmul.f32 q8, q2, q15
vmul.f32 q9, q3, q15

vshrn.i32 d0, q0, #16
vshrn.i32 d1, q1, #16

vst1.32 {q0}, [r0]!

vcle.f32 q12, q2, #0
vcle.f32 q13, q3, #0
vbit.32 q2, q8, q12
vbit.32 q3, q9, q13
vshrn.i32 d4, q2, #16
vshrn.i32 d5, q3, #16

vst1.32 {q2}, [r0]!
sub r5, r5, #4
cmp r5, #4
bge PReluL4Loop

PReluL1:
cmp r5, #0

beq PReluL1End

PReluL1Loop:
vld1.32 {d0}, [r1]!
vshll.s16 q0, d0, #16
vcle.f32 q2, q0, #0
vmul.f32 q1, q0, q15
vbit.32 q0, q1, q2
vshrn.i32 d0, q0, #16
vst1.32 {q0}, [r0]!
subs r5, r5, #1
bne PReluL1Loop

PReluL1End:

subs r4, r4, #1
bne PReluZLoop


PReluEnd:

pop {r4, r5, pc}

#endif
#endif
source/backend/cpu/arm/arm64/bf16/MNNReluWithSlopeChannelBF16.S (97 changes: 97 additions & 0 deletions)
@@ -0,0 +1,97 @@
//
// MNNReluWithSlopeChannelBF16.S
// MNN
//
// Created by MNN on 2022/06/23.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__
#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNReluWithSlopeChannelBF16
//void MNNReluWithSlopeChannelBF16(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad)

//Auto Load:
//x0:dst, x1:src, x2:slope, x3:sizeQuad, x4:depthQuad


cmp x4, #0
beq PReluEnd
cmp x3, #0
beq PReluEnd


PReluZLoop:
ld1 {v23.4h}, [x2], #8
shll v23.4s, v23.4h, #16
mov x5, x3
cmp x5, #3
ble PReluL1

PReluL4Loop:
ld1 {v0.4h, v1.4h}, [x1], #16
shll v0.4s, v0.4h, #16
shll v1.4s, v1.4h, #16

fcmle v20.4s, v0.4s, #0
fcmle v21.4s, v1.4s, #0

ld1 {v2.4h, v3.4h}, [x1], #16
shll v2.4s, v2.4h, #16
shll v3.4s, v3.4h, #16

fmul v16.4s, v0.4s, v23.4s
fmul v17.4s, v1.4s, v23.4s
bit v0.16b, v16.16b, v20.16b
bit v1.16b, v17.16b, v21.16b

fmul v16.4s, v2.4s, v23.4s
fmul v17.4s, v3.4s, v23.4s
shrn v0.4h, v0.4s, #16
shrn v1.4h, v1.4s, #16


st1 {v0.4h, v1.4h}, [x0], #16

fcmle v20.4s, v2.4s, #0
fcmle v21.4s, v3.4s, #0
bit v2.16b, v16.16b, v20.16b
bit v3.16b, v17.16b, v21.16b
shrn v2.4h, v2.4s, #16
shrn v3.4h, v3.4s, #16

st1 {v2.4h, v3.4h}, [x0], #16
sub x5, x5, #4
cmp x5, #4
bge PReluL4Loop

PReluL1:
cmp x5, #0

beq PReluL1End

PReluL1Loop:
ld1 {v0.4h}, [x1], #8
shll v0.4s, v0.4h, #16
fcmle v2.4s, v0.4s, #0
fmul v1.4s, v0.4s, v23.4s
bit v0.16b, v1.16b, v2.16b
shrn v0.4h, v0.4s, #16
st1 {v0.4h}, [x0], #8
subs x5, x5, #1
bne PReluL1Loop

PReluL1End:

subs x4, x4, #1
bne PReluZLoop


PReluEnd:

ret
#endif
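Both kernels above rely on bf16 being the top half of an IEEE-754 float: widen each lane with a left shift by 16 (`vshll`/`shll`), evaluate PRelu in fp32, build a "less than or equal to zero" mask (`vcle`/`fcmle`) and bit-insert the slope product where the mask is set (`vbit`/`bit`), then narrow back with a truncating right shift (`vshrn`/`shrn`). A portable C++ sketch of the same per-element math follows; the helper names are illustrative, not MNN API:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// bf16 <-> fp32 via the same 16-bit shifts the assembly uses.
static float bf16_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;  // shll/vshll #16
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static uint16_t fp32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (uint16_t)(bits >> 16);      // shrn/vshrn #16, truncating
}

// PRelu over 4-wide channel packs: dst = src <= 0 ? src * slope : src.
void prelu_bf16_reference(uint16_t* dst, const uint16_t* src,
                          const uint16_t* slope,
                          size_t sizeQuad, size_t depthQuad) {
    for (size_t z = 0; z < depthQuad; ++z) {
        for (size_t i = 0; i < sizeQuad; ++i) {
            for (size_t j = 0; j < 4; ++j) {
                float v = bf16_to_fp32(src[(z * sizeQuad + i) * 4 + j]);
                float s = bf16_to_fp32(slope[z * 4 + j]);
                // fcmle builds the mask; bit keeps v * s only where v <= 0.
                dst[(z * sizeQuad + i) * 4 + j] =
                    fp32_to_bf16(v <= 0.f ? v * s : v);
            }
        }
    }
}
```

The assembly keeps the select branch-free; the ternary here is just the readable scalar equivalent of the mask-and-insert pair.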
source/backend/cpu/bf16/BF16Functions.cpp (6 changes: 5 additions & 1 deletion)
@@ -20,6 +20,9 @@
#include "BF16Unary.hpp"
using BFVec4 = MNN::Math::VecHalf<4>;
using Vec4 = MNN::Math::Vec<float, 4>;
extern "C" {
void MNNReluWithSlopeChannelBF16(float* dstO, const float* srcO, const float* slopeO, size_t sizeQuad, size_t depthQuad);
}
namespace MNN {
// just for reference BF16 converting of c++ code, not for arm or sse.
inline int16_t MNNFP32ToBF16(float fp32Value) {
@@ -139,6 +142,7 @@ void MNNAxByClampBroadcastUnitBF16(float* CF, const float* AF, const float* BF,
}
}
}
+#ifndef MNN_USE_NEON
void MNNReluWithSlopeChannelBF16(float* dstO, const float* srcO, const float* slopeO, size_t sizeQuad, size_t depthQuad) {
auto slope = (const int16_t*)slopeO;
auto dst = (int16_t*)dstO;
@@ -163,7 +167,7 @@ void MNNReluWithSlopeChannelBF16(float* dstO, const float* srcO, const float* sl
}
}
}
-
+#endif

#if !defined(MNN_USE_SSE) && !defined(MNN_USE_NEON)
void MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) {
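The reference converter `MNNFP32ToBF16` above keeps only the high 16 bits of a float: the sign bit, all 8 exponent bits, and the top 7 mantissa bits. A standalone round-trip sketch (illustrative values, not MNN code) of how much precision that truncation leaves:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    float x = 1.2345678f;
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    uint16_t bf = (uint16_t)(bits >> 16);   // sign + 8 exponent + 7 mantissa bits
    uint32_t widened = (uint32_t)bf << 16;  // widen back to a float bit pattern
    float y;
    std::memcpy(&y, &widened, sizeof(y));
    std::printf("%.7f -> %.7f\n", x, y);    // 1.2345678 -> 1.2343750
    return 0;
}
```

Because bf16 keeps the full float exponent range, the trade-off is purely in mantissa precision (roughly 2 to 3 significant decimal digits), which is why the PRelu kernels can safely do their arithmetic in widened fp32 and truncate on the way out.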