feat(docs): add nms, nonzero add op design docs (Cambricon#9)
* feat(docs): add ops docs.

* feat(bangpy-ops): update add operator computation and the design document.

* fix(docs):add nms design document

* feat(docs):add nonzero design docs.

* fix(docs):fix design docs links.

* fix(docs):rm host.h

* fix(docs):add nms design document.

* fix(docs):add nms design document.

* fix(docs):rename designer name.

* feat(docs): update file link path.

* feat(docs): update file link path.

Co-authored-by: zhourihua <[email protected]>
Co-authored-by: xumingyang <[email protected]>
Co-authored-by: Ai Youngcino <[email protected]>
4 people authored Dec 30, 2021
1 parent b5aee29 commit a2b1646
Showing 13 changed files with 733 additions and 68 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -37,7 +37,7 @@ mlu-ops provides, based on the Cambricon Machine Learning Unit (MLU),

```sh
wget https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz
-tar -xvf /tmp/Python-3.8.0.tgz
+tar -xvf Python-3.8.0.tgz
cd Python-3.8.0
make -j24 && make install
```
2 changes: 1 addition & 1 deletion bangpy-ops/README.md
@@ -31,7 +31,7 @@

`Note`: if the list of specified operators is stored in a file, the format is one operator per line.

-After a successful build, an output folder named after the operator is generated under `bangpy-ops/outs`, containing files such as `libmluops.so` and `host.h`.
+After a successful build, an output folder named after the operator is generated under `bangpy-ops/outs`, containing files such as `libmluops.so`.


## Run the test cases
35 changes: 13 additions & 22 deletions bangpy-ops/ops/add/add.py
@@ -28,9 +28,8 @@
from bangpy.platform.bang_config import ALIGN_LENGTH, TARGET
from bangpy.tcp.runtime import TaskType

-DTYPES = [bangpy.float16]
+DTYPES = [bangpy.float16, bangpy.float32]
TARGET_LIST = ["mlu370-s4", "mlu220-m2", "mlu270", "mlu290"]
-SHAPE = (64000,)
KERNEL_NAME = "add"


@@ -39,42 +38,34 @@ class Add(object):
    Add the data in the two buffers.
    """

-    def __init__(self, shape, dtype, target, task_num):
-        self.shape = shape
+    def __init__(self, dtype, target, task_num):
        self.dtype = dtype
        self.target = target
        self.task_num = task_num
-        self.length = np.prod(shape)
+        self.bp = tcp.TCP(target)
+        self.length = self.bp.SizeVar("length")
        self.nram_size = TARGET(target).nram_size
        self.dtype_sz = dtype.bytes
-        self.bp = tcp.TCP(target)
+        self.single_buffer_size = 1024
        self.bp.launch_task(self.task_num, 1, 1)

    def compute_body(self):
        # calculate split strategy
-        # ensure the data size can be divisible by task_num and 128 bytes aligned
-        assert (self.dtype_sz * self.length) % self.task_num % ALIGN_LENGTH == 0
        # gets the data length to be calculated for each task
        data_calculated_each_task = self.length // self.task_num
-        loop_num = np.ceil(
-            3 * data_calculated_each_task * self.dtype_sz / self.nram_size
-        )
-        # ensure the data size is 128 bytes aligned for each calculation
-        while (
-            data_calculated_each_task % loop_num != 0
-            or data_calculated_each_task // loop_num % ALIGN_LENGTH != 0
-        ):
-            loop_num += 1
-        data_calculated_each_time = int(data_calculated_each_task // loop_num)
+        # gets the number of cycles required for each task
+        loop_num = data_calculated_each_task * self.dtype_sz // self.single_buffer_size
+        # gets the data length for each calculation
+        data_calculated_each_time = self.single_buffer_size // self.dtype_sz
        # declare I/O buffer
        buffer_in0 = self.bp.Buffer(
-            shape=self.shape, name="INPUT0", dtype=self.dtype, scope="global"
+            shape=(self.length,), name="INPUT0", dtype=self.dtype, scope="global"
        )
        buffer_in1 = self.bp.Buffer(
-            shape=self.shape, name="INPUT1", dtype=self.dtype, scope="global"
+            shape=(self.length,), name="INPUT1", dtype=self.dtype, scope="global"
        )
        buffer_out = self.bp.Buffer(
-            shape=self.shape, name="OUTPUT", dtype=self.dtype, scope="global"
+            shape=(self.length,), name="OUTPUT", dtype=self.dtype, scope="global"
        )
        task_id = self.bp.taskId
        # declare on-chip buffer
@@ -117,5 +108,5 @@ def compute_body(self):
def build_add(dtype=None, target=None):
    # tasktype fixed in UNION1
    task_num = 4
-    f = Add(SHAPE, dtype, target, task_num).compute_body()
+    f = Add(dtype, target, task_num).compute_body()
    return f
11 changes: 8 additions & 3 deletions bangpy-ops/ops/add/test_add.py
@@ -28,11 +28,16 @@
from bangpy.common import utils, load_op_by_type
from bangpy.platform.bang_config import ALIGN_LENGTH, TARGET
from bangpy.tcp.runtime import TaskType
-from add import DTYPES, SHAPE, KERNEL_NAME, TARGET_LIST
+from add import DTYPES, KERNEL_NAME, TARGET_LIST


@pytest.mark.parametrize(
-    "shape", [SHAPE],
+    "shape",
+    [
+        (2048,),
+        (4096,),
+        (6144,),
+    ],
)
@pytest.mark.parametrize(
    "dtype", DTYPES,
@@ -54,5 +59,5 @@ def test_add(target, shape, dtype):
    f1 = load_op_by_type(KERNEL_NAME, dtype.name)
    f1(data_in0_dev, data_in1_dev, data_out_dev)
    bangpy.assert_allclose(
-        data_out_dev.numpy(), data_out.astype(dtype.as_numpy_dtype), rtol=1e-3
+        data_out_dev.numpy(), data_out.astype(dtype.as_numpy_dtype)
    )
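The test now covers three shapes instead of the single compile-time `SHAPE`, and `load_op_by_type` fetches the prebuilt kernel per dtype. The host-side setup is collapsed in this view; a hedged sketch of the golden computation the final `assert_allclose` presumably compares against:

```python
# Hedged sketch of the host-side reference data; the real setup lives in
# the collapsed portion of test_add and may differ in details.
import numpy as np

shape, np_dtype = (2048,), np.float16  # one of the new parametrized cases
data_in0 = np.random.uniform(low=-10, high=10, size=shape).astype(np_dtype)
data_in1 = np.random.uniform(low=-10, high=10, size=shape).astype(np_dtype)
data_out = data_in0 + data_in1         # golden result for assert_allclose
```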
2 changes: 1 addition & 1 deletion bangpy-ops/ops/nms/nms.py
@@ -266,7 +266,7 @@ def nms_compute_body(self, task_num):
        with self.tcp.if_scope(self.tcp.taskId == task_num - 1):
            self.tcp.assign(self.x1, 0)
            self.tcp.memcpy(
-                self.output[self.output_box_num : self.max_output_size], self.x1
+                self.output[self.output_box_num : self.max_output_size], self.x1[:]
            )

    def nms_compute(self):
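The one-line fix passes an explicit slice of the scratch buffer, presumably so both `memcpy` operands expose a shape. Semantically, the last task zero-fills the unused tail of the output; a NumPy-equivalent sketch (buffer names and sizes here are illustrative, not the kernel's):

```python
# NumPy analogue of the patched memcpy: on the last task, output slots
# beyond the number of boxes actually kept are padded with zeros.
import numpy as np

max_output_size = 8
output_box_num = 5                                      # boxes kept by NMS
output = np.arange(max_output_size, dtype=np.float32)   # stand-in contents
x1 = np.zeros(max_output_size, dtype=np.float32)        # zeroed scratch

tail = max_output_size - output_box_num
output[output_box_num:max_output_size] = x1[:tail]
print(output)  # [0. 1. 2. 3. 4. 0. 0. 0.]
```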
64 changes: 32 additions & 32 deletions docs/bangpy-docs/BANGPy-OPS算子设计文档模板.md
@@ -38,25 +38,25 @@

example:

-| Operator summary | Briefly state what the operator does; the detailed description goes in 1.2 |
-| ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| Requirement source | PyTorch/Tensorflow/... |
-| Target networks | resnet50/... |
-| Input data types | half, float |
-| Input Shape | input1: [batches, hi, wi, channels]; input2: [batches, 4] |
-| Input Layout | input1: NHWC; input2: ARRAY |
-| Output data types | half, float... |
-| Output Shape | [batches, ho, wo, channels] |
-| Output Layout | NHWC |
-| Mode (optional) | |
-| Contains dim/axis-like parameters that support negative values/other special handling | Has a dim/axis parameter that must support negative values / contains no dim/axis-like parameters, etc.<br>(e.g. the dim parameter in the scatter operator interface may be negative; when dim=-1 the computation actually runs on the lowest dimension) |
+| Operator summary | Briefly state what the operator does; the detailed description goes in 1.2 |
+| ------------ | ----------------------------------------------------------- |
+| Requirement source | PyTorch/Tensorflow/... |
+| Target networks | resnet50/... |
+| Input data types | half, float |
+| Input Shape | input1: [batches, hi, wi, channels]; input2: [batches, 4] |
+| Input Layout | input1: NHWC; input2: ARRAY |
+| Output data types | half, float... |
+| Output Shape | [batches, ho, wo, channels] |
+| Output Layout | NHWC |
+| Mode (optional) | |
+| Contains dim/axis-like parameters that support negative values/other special handling | Has a dim/axis parameter that must support negative values / contains no dim/axis-like parameters, etc.<br>(e.g. the dim parameter in the scatter operator interface may be negative; when dim=-1 the computation actually runs on the lowest dimension) |
| Contains labels/index-like parameters that support negative values/out-of-range values/other special handling | Has a labels parameter where the output is NaN when labels fall outside the dim range / has an index parameter that must support negative values / contains no labels/index-like parameters, etc.<br>(e.g. sparse_softmax_ce_logits allows label values outside the dim range; advanced_index supports negative index values) |
-| In-place support needed | Yes/No |
-| stride mechanism support needed | Yes/No |
-| Broadcast support needed | Yes/No (if yes, list exactly which parameters must support it) |
-| Return directly on 0-element check | Yes/No |
-| Other special requirements (online quantization, fusion, early data conversion, etc., optional) | |
-| Scale/mode given priority in this development | Support xxx mode / NHWC layout first |
+| In-place support needed | Yes/No |
+| stride mechanism support needed | Yes/No |
+| Broadcast support needed | Yes/No (if yes, list exactly which parameters must support it) |
+| Return directly on 0-element check | Yes/No |
+| Other special requirements (online quantization, fusion, early data conversion, etc., optional) | |
+| Scale/mode given priority in this development | Support xxx mode / NHWC layout first |

### 1.2 Operator functionality and application scenarios

@@ -87,17 +87,17 @@ example:

example:

-| Limitation type | Details |
-| ------------ | --------------------------------------------------------------------------------------------------------------- |
-| Data type limits | input and output may not both be half |
+| Limitation type | Details |
+| ------------ | ------------------------------------------------------------------------------------------------- |
+| Data type limits | input and output may not both be half |
| Layout limits | NCHW layout is not supported; only operators that genuinely need to be layout-aware should state such a limit, and layout-agnostic operators should not add extra guards |
-| Scale limits | output_size <= 384KB |
-| Functional limits | xxx mode is not supported / the combination of xx and xxx modes is not supported |
-| Data range limits | with half, data must lie in the range [xx, xx], otherwise there are accuracy problems |
-| In-place limits | in-place is not supported |
-| stride limits | the stride mechanism is not supported |
-| Broadcast limits | the xxx parameter does not support broadcasting |
-| xx limits | xxx |
+| Scale limits | output_size <= 384KB |
+| Functional limits | xxx mode is not supported / the combination of xx and xxx modes is not supported |
+| Data range limits | with half, data must lie in the range [xx, xx], otherwise there are accuracy problems |
+| In-place limits | in-place is not supported |
+| stride limits | the stride mechanism is not supported |
+| Broadcast limits | the xxx parameter does not support broadcasting |
+| xx limits | xxx |

### 1.5 Acceptance criteria

@@ -160,12 +160,12 @@ example:

1. Resource allocation

-| Item | Allocation strategy |
-| --------------- | ---------------------------------------------------------------------- |
+| Item | Allocation strategy |
+| --------------- | ------------------------------------------------------------------- |
| NRAM | e.g. holds neuron data |
| WRAM | e.g. holds weight data |
-| SRAM | e.g. temporarily stages I/O input and output data; weights are loaded onto the SM first and then broadcast |
-| DRAM(workspace) | e.g. stores temporary data produced during computation, such as partial sums when ci is split |
+| SRAM | e.g. temporarily stages I/O input and output data; weights are loaded onto the SM first and then broadcast |
+| DRAM(workspace) | e.g. stores temporary data produced during computation, such as partial sums when ci is split |

2. Pipeline design

