[MNN:Sync] Sync Internal 2.8.0

mcx · Dec 4, 2023 · 387775b · 387775b
1 parent 8d5d8b8
commit 387775b
Showing 347 changed files with 34,680 additions and 12,043 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -43,6 +43,7 @@ option(MNN_SUPPORT_DEPRECATED_OP "Enable MNN's tflite quantized op" ON)
 option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
 option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
 option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
+option(MNN_SUPPORT_RENDER "Enable MNN Render Ops" OFF)
 option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
 option(MNN_SEP_BUILD "Build MNN Backends and expression separately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
 option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
@@ -162,6 +163,9 @@ endif()
 if(MNN_SUPPORT_DEPRECATED_OP)
     add_definitions(-DMNN_SUPPORT_DEPRECATED_OP)
 endif()
+if(MNN_SUPPORT_RENDER)
+    add_definitions(-DMNN_SUPPORT_RENDER)
+endif()
 
 # debug options
 if(MNN_DEBUG_MEMORY)
@@ -372,7 +376,7 @@ list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMath>)
 list(APPEND MNN_TARGETS MNNMath)
 
 # Transform
-FILE(GLOB MNN_Transform_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/* ${CMAKE_CURRENT_LIST_DIR}/source/geometry/*)
+FILE(GLOB_RECURSE MNN_Transform_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/* ${CMAKE_CURRENT_LIST_DIR}/source/geometry/*)
 add_library(MNNTransform OBJECT ${MNN_Transform_SRC})
 IF (NOT MNN_BUILD_MINI)
     list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTransform>)
@@ -601,9 +605,12 @@ IF(MNN_BUILD_TRAIN OR MNN_BUILD_QUANTOOLS)
   add_subdirectory(tools/train)
   IF(MNN_SEP_BUILD)
     list(APPEND MNN_DEPS MNNTrain)
+    list(APPEND MNN_DEPS MNNTrainUtils)
   ELSE()
     list(APPEND MNN_TARGETS MNNTrain)
+    list(APPEND MNN_TARGETS MNNTrainUtils)
     list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTrain>)
+    list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTrainUtils>)
   ENDIF()
 ENDIF()
 

diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md
@@ -63,11 +63,8 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_VULKAN_DEBUG     | 是否打开Vulkan的DEBUG模式，该宏仅在`MNN_VULKAN=ON`时生效，默认为`OFF` |
 | MNN_OPENGL_REGEN     | 是否重新生成OpenGL Kenel，该宏仅在`MNN_OPENGL=ON`时生效，默认为`OFF` |
 | MNN_TRT_DYNAMIC      | 是否通过dlopen的方式引入TRT的动态库，该宏仅在`MNN_TENSORRT=ON`时生效，默认为`OFF |
-| TF_CONVERT_ORIGIN    | 构建的`MNNConvert`是否使用原始TF转换模式，该宏仅在`MNN_BUILD_CONVERTER=ON`时生效，默认为`OFF` |
-| TFMODEL_OPTIMIZE     | 构建的`MNNConvert`是否对Tensorflow模型执行优化，该宏仅在`MNN_BUILD_CONVERTER=ON`时生效，默认为`OFF` |
 | MNN_BUILD_TORCH      | 构建的`MNNConvert`是否支持`TorchScript`，该宏仅在`MNN_BUILD_CONVERTER=ON`时生效，默认为`OFF` |
 | MNN_TRAIN_DEBUG      | 构建的训练模块是否支持调试，该宏仅在`MNN_BUILD_TRAIN=ON`时生效，默认为`OFF` |
-| MNN_BUILD_TRAIN_MINI | 构建删减版训练模块，不构建`Dataset`与`model`，该宏仅在`MNN_BUILD_TRAIN=ON`时生效，默认为`OFF` |
 | MNN_USE_OPENCV       | 构建的训练Demo是否使用`OpenCV`依赖，该宏仅在`MNN_BUILD_TRAIN=ON`时生效，默认为`OFF` |
 | MNN_IMGPROC_COLOR    | 构建MNN的OpenCV功能是否开启`颜色空间转换`，默认为`ON` |
 | MNN_IMGPROC_GEOMETRIC | 构建MNN的OpenCV功能是否开启`形变`，默认为`ON` |
@@ -83,4 +80,5 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_OPENCV_BENCH     | 构建MNN的OpenCV功能是否开启性能benchmark，默认为`OFF` |
 | MNN_VULKAN_IMAGE     | 构建MNN的Vulkan后端时采用Image内存模式，以便支持FP16和部分移动端上GPU的加速，默认为`ON` |
 | MNN_LOW_MEMORY       | 是否支持低内存模式，支持低内存模式使用权值量化模型并设置`low_memory`则会使用计算时反量化，默认为`OFF` |
+| MNN_SUPPORT_RENDER       | 是否支持图形渲染相关算子实现，默认为 `OFF` |
 | MNN_BUILD_LLM        | 是否构建基于MNN的llm库和demo，默认为`OFF` |
diff --git a/docs/compile/pymnn.md b/docs/compile/pymnn.md
@@ -2,8 +2,8 @@
 ## 本地安装
 ```bash
 cd /path/to/MNN/pymnn/pip_package
-python build_deps.py
-python setup.py install --version {MNN版本}
+python build_deps.py {MNN依赖包组合} #internal,cuda,trt,cuda_tune,opencl,vulkan,render,no_sse,torch这几个字符串的任意组合，例如字符串可为:"cuda,render,no_sse"
+python setup.py install --version {MNN版本} --deps {MNN依赖包组合}
 ```
 ## 构建Python Wheel包
 - Linux
@@ -41,4 +41,4 @@ python setup.py install --version {MNN版本}
     .\package_scripts\win\build_whl.ps1 -version {MNN版本} -backends "opencl,vulkan" -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "py27,py37,py38,py39"
     # CPU+OpenCL+Vulkan，32位编译
     .\package_scripts\win\build_whl.ps1 -version {MNN版本} -backends "opencl,vulkan" -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "py27-win32,py37-win32,py38-win32,py39-win32"
-    ```
+    ```
diff --git a/docs/compile/tools.md b/docs/compile/tools.md
@@ -29,10 +29,8 @@
 - 编译产物
   - `MNNTrain` 训练框架库
   - `runTrainDemo.out` 运行训练框架demo的入口程序
-  - `transformer.out` 训练模型转换器
-  - `train.out` 训练功能入口程序
-  - `rawDataTransform.out` 将json文件转换为flatbuffers文件
-  - `dataTransformer.out` 将图片转换为flatbuffers文件
+  - `transformer` 训练模型转换器，将推理用的MNN模型转换为执行训练的MNN模型
+  - `extractForInfer` 从执行训练的MNN模型中提取参数，对应更新推理用的MNN模型
 ## 测试工具
 - 相关编译选项
   - `MNN_BUILD_TOOL` 是否编译测试工具

diff --git a/docs/inference/module.md b/docs/inference/module.md
@@ -56,6 +56,41 @@ std::unique_ptr<Module> module; // module
 module.reset(Module::load(input_names, output_names, model_filename.c_str(), rtMgr, &mdconfig));
 ```
 
+### Module::Config 
+创建`Module`时可传入`Module::Config`，具体结构如下：
+
+```cpp
+struct Config {
+    // Load module as dynamic, default static
+    bool dynamic = false;
+
+    // for static mode, if the shape is mutable, set true, otherwise set false to avoid calling resizeSession frequently
+    bool shapeMutable = true;
+    // Pre-rearrange weights or not. Disabled by default.
+    // The weights will be rearranged in a general way, so the best implementation
+    // may not be adopted if `rearrange` is enabled.
+    bool rearrange = false;
+
+    BackendInfo* backend = nullptr;
+};
+```
+
+#### dynamic
+- 默认为 false ，输出的变量为const ，只能得到数据
+- 若 dynamic = true ，加载出的模型将按动态图方式运行，会增加额外构图耗时，但可以保存输出变量的计算路径，存成模型
+- 若 dynamic = true ，后面的 shapeMutable / rearrange 不再生效
+
+#### shapeMutable
+- 默认为 true ，表示输入形状易变，将延迟进行形状相关计算
+- 设置为 false 时，会提前申请内存，在 onForward 时做输入数据的拷贝而不是直接使用指针
+
+#### rearrange
+- 若为 true ，在创建 Module 时会预先创建卷积算子，做权重重排，以降低运行时的内存
+- 目前只支持 CPU 和 CUDA 后端
+
+#### backend
+已经废弃，不要设置此项
+
 ### 获取模型信息
 调用`getInfo`函数可获取`Module`信息，可以参考代码：`tools/cpp/GetMNNInfo.cpp`，[工具](../tools/test.html#getmnninfo)
 ```cpp

diff --git a/docs/pymnn/expr.md b/docs/pymnn/expr.md
@@ -145,6 +145,52 @@ array([0., 1., 2., 3.], dtype=float32)
 'Input'
 ```
 ---
+### `set_lazy_mode(mode)`
+设置惰性计算的模式，仅在开启惰性求值的状态下生效：
+
+- 0 : 所有计算均延迟执行
+- 1 : 立即进行几何计算，内容计算延迟执行，适用于构建静态模型或训练时求导
+
+默认为0
+
+
+参数：
+- `mode:int` 模式类型
+
+返回：`None`
+
+返回类型：`None`
+
+示例：
+```python
+>>> expr.lazy_eval(True)
+>>> expr.set_lazy_mode(0)
+>>> y = expr.concat([x], -1)
+>>> expr.save([y], "concat.mnn") # 模型中为 concat 算子
+>>> expr.set_lazy_mode(1)
+>>> y = expr.concat([x], -1)
+>>> expr.save([y], "concat_static.mnn") # 模型中为 raster 算子
+```
+
+---
+### `set_global_executor_config(backend, precision, threadnum)`
+设置expr运行后端、精度、线程数(gpu代表mode)：
+
+参数：
+- `backend:int` 例如：0->CPU 1->Metal 2->CUDA 3->OPENCL 
+- `precision:int` 例如：0->Normal 1->High 2->Low
+- `threadnum:int` 例如：CPU表示线程数  GPU表示Mode
+
+返回：`None`
+
+返回类型：`None`
+
+示例：
+
+```python
+>>> expr.set_global_executor_config(2, 2, 1)
+```
+---
 ### `sign(x)`
 返回输入值的符号，正数返回1，负数返回-1
 
@@ -3054,4 +3100,4 @@ dict_keys(['conv1', 'conv2_1/dw', 'conv2_1/sep', 'conv2_2/dw', 'conv2_2/sep', 'c
 dict_keys(['data'])
 >>> outputs.keys()
 dict_keys(['prob'])
-```
+```
diff --git a/docs/tools/test.md b/docs/tools/test.md
@@ -87,11 +87,12 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
 - 16 : 适用于使用 GPU 的情况，由 MNN 优先选择 CPU 运行，并将 GPU 的 tuning 信息存到 cache 文件，所有算子 tuning 完成则启用 GPU
 - 32 : rearrange 设为 true ，降低模型加载后的内存大小，但会增加模型加载的初始化时间
 - 64 : 创建模型后，clone 出一个新的模型运行，用于测试 clone 功能（主要用于多并发推理）的正确性
+- 128 : 使用文件夹下面的 input.mnn 和 output.mnn 作为输入和对比输出，对于数据量较大的情况宜用此方案
 
 
 ### 示例
 ```bash
-$ python ../tools/script/fastTestOnnx.py mobilenetv2-7.onnx
+$ python ../tools/script/testMNNFromOnnx.py mobilenetv2-7.onnx
 $ ./ModuleBasic.out mobilenetv2-7.mnn onnx 0 0 10   
 Test mobilenetv2-7.mnn from input info: onnx
 input
@@ -114,7 +115,7 @@ Avg= 9.946699 ms, min= 9.472000 ms, max= 10.227000 ms
 - `model:str` 模型文件路径
 - `forwardType:int` 执行推理的计算设备，有效值为：0（CPU）、1（Metal）、2（CUDA）、3（OpenCL）、6（OpenGL），7(Vulkan) ，9 (TensorRT)
 - `shapeMutable:int` 输入形状是否可变
-- `dir_n:str` 输入输出信息文件夹，可使用 fastTestOnnx.py / fastTestTf.py / fastTestTflite.py 等脚本生成，参考模型转换的正确性校验部分
+- `dir_n:str` 输入输出信息文件夹，可使用 testMNNFromOnnx.py 等脚本生成，参考模型转换的正确性校验部分
 ```bash
 ./SequenceModuleTest.out transformer.mnn 0 1 tr tr1 tr2 tr3 tr4 > error.txt
 ```

diff --git a/express/Executor.cpp b/express/Executor.cpp
@@ -145,6 +145,7 @@ std::shared_ptr<Executor> Executor::getGlobalExecutor() {
         info.type = MNN_FORWARD_CPU;
         info.numThread = 1;
         std::shared_ptr<Runtime> bn(creator->onCreate(info));
+        bn->setAllocatorType(info.allocator);
         gExecutor = new std::shared_ptr<Executor>(new Executor(bn, MNN_FORWARD_CPU, 1));
     });
     return *gExecutor;
@@ -668,10 +669,9 @@ std::shared_ptr<Executor::SubGraph> Executor::findSubGraph(const std::string& su
     }
     return iter->second;
 }
-void Executor::setLazyComputeMode(LazyMode mode) {
+void Executor::setLazyComputeMode(uint32_t mode) {
     mLazyMode = mode;
 }
 
-
 } // namespace Express
 } // namespace MNN
diff --git a/express/Expr.cpp b/express/Expr.cpp
@@ -193,8 +193,11 @@ EXPRP Expr::create(std::shared_ptr<BufferStorage> extra, std::vector<VARP>&& inp
     expr->mStorage = extra;
     expr->mOp = flatbuffers::GetRoot<Op>(extra->buffer());
     expr->mInputs   = std::move(inputs);
-    expr->mInside->mReq = ExecutorScope::Current()->getRequirement(expr.get());
-    _addLinkForInputs(expr);
+    auto exe = ExecutorScope::Current();
+    expr->mInside->mReq = exe->getRequirement(expr.get());
+    if (!(exe->getLazyMode() & Executor::LAZY_COMPUTE_ONCE)) {
+        _addLinkForInputs(expr);
+    }
     return expr;
 }
 
@@ -350,7 +353,7 @@ VARP Variable::create(EXPRP expr, int index) {
     }
     // CONTENT Mode
     do {
-        if (executor->getLazyMode() != Executor::LAZY_CONTENT) {
+        if (!(executor->getLazyMode() & Executor::LAZY_CONTENT)) {
             break;
         }
         if (expr->get() == nullptr) {
@@ -1016,7 +1019,6 @@ blob->dataType = DataType_DT_##TYPE;
 
 void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
     auto executeOrder = getExecuteOrder(vars);
-
     // Search subgraphs
     std::map<std::string, std::shared_ptr<Executor::SubGraph>> subgraphs;
     auto exe = ExecutorScope::Current();
@@ -1086,15 +1088,9 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
                 blob->dataFormat = (MNN_DATA_FORMAT)Utils::convertFormat(info.order);
                 blob->dims       = info.dim;
                 if (info.type.code == halide_type_float) {
-                    if (info.type.bits == 16) {
-                        blob->dataType = DataType_DT_BFLOAT16;
-                        blob->uint8s.resize(info.size * 2);
-                        ::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(int16_t));
-                    } else {
-                        blob->dataType = DataType_DT_FLOAT;
-                        blob->float32s.resize(info.size);
-                        ::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
-                    }
+                    blob->dataType = DataType_DT_FLOAT;
+                    blob->float32s.resize(info.size);
+                    ::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
                 } else if (info.type.code == halide_type_int && info.type.bits == 32) {
                     blob->dataType = DataType_DT_INT32;
                     blob->int32s.resize(info.size);
@@ -1107,6 +1103,10 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
                     blob->dataType = DataType_DT_UINT8;
                     blob->uint8s.resize(info.size);
                     ::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(uint8_t));
+                } else if (info.type.code == halide_type_bfloat && info.type.bits == 16) {
+                    blob->dataType = DataType_DT_BFLOAT16;
+                    blob->uint8s.resize(info.size * 2);
+                    ::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(int16_t));
                 }
                 op->type       = OpType_Const;
                 if (expr->mType == VARP::TRAINABLE) {
@@ -1163,12 +1163,14 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
                     dest->tensorName[subindex] = op->name + numberToString(v);
                 }
             }
-            if (staticModel) {
-                auto tensor = expr->inside()->mOutputTensors[v];
+            auto tensor = expr->inside()->mOutputTensors[v];
+
+            if (staticModel || TensorUtils::getDescribe(tensor)->quantAttr) {
                 auto des = TensorUtils::getDescribe(tensor);
                 auto describe = std::unique_ptr<MNN::TensorDescribeT>(new MNN::TensorDescribeT);
                 describe->index = varIndexInfo[expr] + v;
                 describe->blob = std::unique_ptr<MNN::BlobT>(new MNN::BlobT);
+                describe->name = dest->tensorName[subindex];
                 auto& blob = describe->blob;
                 blob->dataFormat = des->dimensionFormat;
                 if (tensor->getType() == halide_type_of<float>()) {
@@ -1190,18 +1192,20 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
                     describe->quantInfo->zero = tensorDes->quantAttr->zero;
                     describe->quantInfo->scale = tensorDes->quantAttr->scale;
                 }
-                for (auto& reg : des->regions) {
-                    auto regionT = std::unique_ptr<MNN::RegionT>(new MNN::RegionT);
-                    regionT->src = std::unique_ptr<MNN::ViewT>(new MNN::ViewT);
-                    regionT->dst = std::unique_ptr<MNN::ViewT>(new MNN::ViewT);
-                    regionT->src->offset = reg.src.offset;
-                    regionT->dst->offset = reg.dst.offset;
-                    for (int s = 0; s < 3; s++) {
-                        regionT->src->stride.push_back(reg.src.stride[s]);
-                        regionT->dst->stride.push_back(reg.dst.stride[s]);
-                        regionT->size.push_back(reg.size[s]);
+                if (staticModel) {
+                    for (auto& reg : des->regions) {
+                        auto regionT = std::unique_ptr<MNN::RegionT>(new MNN::RegionT);
+                        regionT->src = std::unique_ptr<MNN::ViewT>(new MNN::ViewT);
+                        regionT->dst = std::unique_ptr<MNN::ViewT>(new MNN::ViewT);
+                        regionT->src->offset = reg.src.offset;
+                        regionT->dst->offset = reg.dst.offset;
+                        for (int s = 0; s < 3; s++) {
+                            regionT->src->stride.push_back(reg.src.stride[s]);
+                            regionT->dst->stride.push_back(reg.dst.stride[s]);
+                            regionT->size.push_back(reg.size[s]);
+                        }
+                        describe->regions.emplace_back(std::move(regionT));
                     }
-                    describe->regions.emplace_back(std::move(regionT));
                 }
                 dest->extraTensorDescribe.emplace_back(std::move(describe));
             }

diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
@@ -1327,7 +1327,6 @@ VARP _Range(VARP start, VARP limit, VARP delta) {
     std::unique_ptr<OpT> op(new OpT);
     op->type       = OpType_Range;
     auto rangeParam = new RangeT;
-    rangeParam->Tidx = (MNN::DataType)Utils::convertDataType(start->getInfo()->type);
     op->main.type = OpParameter_Range;
     op->main.value = rangeParam;
     return Variable::create(Expr::create(std::move(op), {start, limit, delta}));

diff --git a/express/Utils.cpp b/express/Utils.cpp
@@ -81,7 +81,7 @@ halide_type_t Utils::revertDataType(DataType dataType) {
     CONVERT(DataType_DT_UINT8, halide_type_of<uint8_t>(), dataType);
     CONVERT(DataType_DT_INT8, halide_type_of<int8_t>(), dataType);
     CONVERT(DataType_DT_HALF, halide_type_of<float>(), dataType);
-    CONVERT(DataType_DT_BFLOAT16, halide_type_t(halide_type_float, 16), dataType);
+    CONVERT(DataType_DT_BFLOAT16, halide_type_t(halide_type_bfloat, 16), dataType);
     return halide_type_of<float>();
 }
 Express::Dimensionformat Utils::revertFormat(int format) {

diff --git a/express/module/PipelineModule.cpp b/express/module/PipelineModule.cpp
@@ -518,7 +518,7 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
     scheduleInfo.defaultBackend = sharedConst->defaultBackend;
     scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
     scheduleInfo.allTensors = sharedConst->allTensors;
-    initTensors(scheduleInfo.allTensors, net);
+    scheduleInfo.validForResize = initTensors(scheduleInfo.allTensors, net);
     std::vector<Schedule::OpCacheInfo> oplists;
     std::vector<const Op*> ops;
     ops.reserve(info.opList.size());

diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp
@@ -367,7 +367,11 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
         if (mResource->mUseContentInputs) {
             mSession->setNeedResize();
         }
-        mSession->resize();
+        auto code = mSession->resize();
+        if (NO_ERROR != code) {
+            FUNC_PRINT(code);
+            return {};
+        }
     } else {
         // Resize
         for (int i = 0; i < inputs.size(); ++i) {

diff --git a/include/MNN/HalideRuntime.h b/include/MNN/HalideRuntime.h
@@ -60,8 +60,9 @@ typedef enum halide_type_code_t
 {
     halide_type_int = 0,   //!< signed integers
     halide_type_uint = 1,  //!< unsigned integers
-    halide_type_float = 2, //!< floating point numbers
-    halide_type_handle = 3 //!< opaque pointer type (void *)
+    halide_type_float = 2, //!< IEEE floating point numbers
+    halide_type_handle = 3, //!< opaque pointer type (void *)
+    halide_type_bfloat = 4  //!< floating point numbers in the bfloat format
 } halide_type_code_t;
 
 // Note that while __attribute__ can go before or after the declaration,