Add Cpp Doc Generate tools (PaddlePaddle#5900)

* CAPItools * add note
MarisaSparkL · Jun 13, 2023 · 9d695ac · 9d695ac
1 parent 9dca23d
commit 9d695ac
Show file tree

Hide file tree

Showing 5 changed files with 877 additions and 0 deletions.
diff --git a/ci_scripts/CAPItools/README.md b/ci_scripts/CAPItools/README.md
@@ -0,0 +1,55 @@
+# CAPI tools
+CAPI tools 用于一键生成 C++ 的 rst 文档。
+
+## 调用方式
+```bash
+python main.py <source dir> <target dir>
+```
+
+若不设置`source dir`和`target dir`，则默认先查找已安装的`paddlepaddle`包环境。
+
+其中：
+- source dir 是安装后的 Paddle C++ API 声明路径。 例如`venv/Lib/site-packages/paddle/include/paddle`。
+- target dir 目标文件保存路径。
+
+最终生成结果如下所示：
+```text
+target dir
+|- cn
+    |- index.rst
+    |- Paddle
+        |- fluid
+        |- phi
+        |- ...
+|- en
+    |- index.rst
+    |- Paddle
+        |- fluid
+        |- phi
+        |- ...
+```
+
+## 获取最新 PaddlePaddle
+python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html
+
+## 代码结构
+
+### `main.py`文件主要用于处理和筛选包文件, 并调用`utils_helper.py`中的函数进行文件生成
+```python
+def analysis_file() # 用于解析文件内容(多线程不安全)
+
+def generate_docs() # 用于创建目录并传值给 utils_helper.py 中的函数进行文件生成
+
+def cpp2py() # 用于筛选出 cpp api 和 py api 相对应的函数名称
+```
+
+### `utils_helper.py`文件主要存放函数生成、解析, 以及文件写入的工作
+```python
+
+class func_helper(object) # 用于生成和解析方法
+    decode() # 用于解析输入输出参数、函数名称、返回值、函数注释信息
+class class_helper(object) # 用于生成和解析类
+    decode() # 同 func_helper()
+
+def generate_overview() # 用于生成 overview.rst 文件
+```
diff --git a/ci_scripts/CAPItools/main.py b/ci_scripts/CAPItools/main.py
@@ -0,0 +1,150 @@
+# python main.py [source dir] [target dir]
+# python main.py ../paddle .
+
+
+import CppHeaderParser
+import json
+import os
+import traceback
+import sys
+import re
+
+from utils_helper import func_helper, class_helper, generate_overview
+from utils import get_PADDLE_API_class, get_PADDLE_API_func
+
+
+def analysis_file(path):
+    """Parse one C++ file and return its functions, classes and enums.
+
+    The file is read as UTF-8 by ``CppHeaderParser.CppHeader`` and the
+    parser's JSON dump is decoded back into plain Python objects.
+
+    Note: this is not safe to use from multiple threads; call it from a
+    single thread only.
+
+    Args:
+        path: Path of the file to parse.
+
+    Returns:
+        The object decoded from the parser's ``toJSON()`` output.
+    """
+    parsed_json = CppHeaderParser.CppHeader(path, encoding='utf8').toJSON()
+    return json.loads(parsed_json)
+
+
+def _prepare_doc_dir(item, save_dir, language):
+    """Create (if needed) and return the output directory for one entry."""
+    path = item["filename"].replace("../", "").replace(".h", "")
+    dir_path = os.path.join(save_dir, language, path)
+    # exist_ok avoids the check-then-create race of exists() + makedirs().
+    os.makedirs(dir_path, exist_ok=True)
+    return dir_path
+
+
+def generate_docs(
+    all_funcs, all_class, cpp2py_api_list, save_dir, LANGUAGE="cn"
+):
+    """Write one ``.rst`` document per collected function and class.
+
+    Each document is written to
+    ``<save_dir>/<LANGUAGE>/<header path without "../" and ".h">/<name>.rst``.
+    A failure while generating a single document is printed together with
+    its traceback and skipped, so the remaining documents are still written.
+
+    Args:
+        all_funcs: Parsed function entries; each must provide the keys
+            ``"filename"`` and ``"name"``.
+        all_class: Parsed class entries providing the same two keys.
+        cpp2py_api_list: Names forwarded unchanged to ``func_helper``.
+        save_dir: Root output directory.
+        LANGUAGE: Name of the language sub-directory, also forwarded to the
+            helpers' ``create_and_write_file`` (defaults to ``"cn"``).
+    """
+    for item in all_funcs:
+        dir_path = _prepare_doc_dir(item, save_dir, LANGUAGE)
+
+        # Strip '/' from the name so it cannot introduce an extra path
+        # component in the file name (matters on Linux).
+        func_name = item["name"].replace("/", "")
+
+        # Note: symbolic operators only skip the rst file; they are still
+        # listed in the overview to show that the operator exists.
+        if func_name.startswith('operator'):
+            checkwords = func_name.replace('operator', '', 1)
+            if re.search(r"\w", checkwords) is None:
+                # Nothing but symbols after "operator" (e.g. operator*):
+                # such a name is not a valid file name on Windows.
+                continue
+        rst_dir = os.path.join(dir_path, func_name + ".rst")
+        try:
+            helper = func_helper(item, cpp2py_api_list)
+            helper.create_and_write_file(rst_dir, LANGUAGE)
+        except Exception:
+            # Best effort: report and continue, but let KeyboardInterrupt
+            # and SystemExit propagate (a bare except would swallow them).
+            print(traceback.format_exc())
+            print('FAULT GENERATE:' + rst_dir)
+
+    for item in all_class:
+        dir_path = _prepare_doc_dir(item, save_dir, LANGUAGE)
+
+        # The parser glues the export macro onto the class name
+        # (e.g. PADDLE_APIDeviceContextPool); remove it for the file name.
+        class_name = item["name"].replace("PADDLE_API", "")
+        rst_dir = os.path.join(dir_path, class_name + ".rst")
+        try:
+            helper = class_helper(item)
+            helper.create_and_write_file(rst_dir, LANGUAGE)
+        except Exception:
+            print(traceback.format_exc())
+            print('FAULT GENERATE:' + rst_dir)
+
+
+def cpp2py(data: dict):
+    """Collect the C++ API names that have a matching Python API.
+
+    The names are later used to build links to the corresponding Python
+    API documents.
+
+    Args:
+        data: Parsed file data; its ``"using"`` entry is iterated and every
+            item has the ``"paddle::"`` text removed.
+
+    Returns:
+        A new list with the cleaned names, in iteration order.
+    """
+    return [alias.replace("paddle::", "") for alias in data["using"]]
+
+
+# Script entry point. The overall flow is:
+# 1. Decide the source directory and the output directory.
+# 2. Collect the PADDLE_API functions and classes to document.
+# 3. Generate the documents and the overview pages.
+if __name__ == "__main__":
+    root_dir = ''
+    save_dir = '.'  # save into the current directory by default
+    if len(sys.argv) == 3:
+        root_dir = sys.argv[1]
+        save_dir = sys.argv[2]
+
+    if root_dir == '':
+        try:
+            import paddle
+            import inspect
+
+            root_dir = os.path.dirname(inspect.getsourcefile(paddle))
+        except Exception:
+            # for simple run: fall back to a relative path when the installed
+            # package cannot be located. Catching Exception instead of using
+            # a bare except keeps Ctrl-C and SystemExit working.
+            root_dir = '../paddle'
+            save_dir = '.'  # save into the current directory by default
+
+    all_funcs = []
+    all_class = []
+    cpp2py_api_list = []
+    overview_list = []
+    for home, dirs, files in os.walk(root_dir):
+        for file_name in files:
+            # Skip files that do not need to be processed.
+            if file_name.split(".")[-1] not in ["cc", "cu", "h"]:
+                continue
+
+            file_path = os.path.join(home, file_name)
+            # Handle the file that maps C++ APIs to Python APIs; currently
+            # this is the only file whose C++ and Python APIs correspond.
+            if file_name == "tensor_compat.h":
+                cpp2py_data = analysis_file(file_path)
+                # cpp2py() already returns a fresh list, no copy needed.
+                cpp2py_api_list = cpp2py(cpp2py_data)
+
+            # Skip files that do not contain PADDLE_API.
+            with open(file_path, encoding='utf-8') as f:
+                if 'PADDLE_API ' not in f.read():
+                    continue
+
+            print("Parsing: ", file_path)
+            data = analysis_file(file_path)
+
+            # Extract the exported functions and classes.
+            current_func = get_PADDLE_API_func(data)
+            current_class = get_PADDLE_API_class(data)
+
+            # Record them, both globally and per header for the overview.
+            all_funcs.extend(current_func)
+            all_class.extend(current_class)
+            overview_list.append(
+                {
+                    'h_file': file_path,
+                    'class': current_class,
+                    'function': current_func,
+                }
+            )
+
+    # Generate the documents.
+    generate_docs(all_funcs, all_class, cpp2py_api_list, save_dir, "cn")
+    generate_docs(all_funcs, all_class, cpp2py_api_list, save_dir, "en")
+
+    # Generate the overview pages.
+    generate_overview(overview_list, save_dir, "cn")
+    generate_overview(overview_list, save_dir, "en")
+
+    # Statistics.
+    print("PADDLE_API func count: ", len(all_funcs))
+    print("PADDLE_API class count: ", len(all_class))
+    print("cpp2py api count: ", len(cpp2py_api_list))
diff --git a/ci_scripts/CAPItools/requirements.txt b/ci_scripts/CAPItools/requirements.txt
@@ -0,0 +1,2 @@
+robotpy-cppheaderparser==5.1.0
+# paddle
diff --git a/ci_scripts/CAPItools/utils.py b/ci_scripts/CAPItools/utils.py
@@ -0,0 +1,94 @@
+def get_PADDLE_API_func(data: dict):
+    """Return the parsed functions that are exported with PADDLE_API.
+
+    CppHeaderParser stores its result as a dictionary, and each function's
+    ``'debug'`` field keeps the original declaration text. A function whose
+    ``'debug'`` text contains ``PADDLE_API`` is an externally exposed API.
+
+    Args:
+        data: Parsed file data with a ``"functions"`` list.
+
+    Returns:
+        A new list holding the matching function entries, in order.
+    """
+    return [func for func in data["functions"] if 'PADDLE_API' in func['debug']]
+
+
+def get_PADDLE_API_class(data: dict):
+    """Return the parsed classes that are exported with PADDLE_API.
+
+    The parser glues the export macro onto the class name (for example
+    ``PADDLE_APIDeviceContextPool``), so a class counts as externally
+    exposed when its key contains ``PADDLE_API``. Entries declared as
+    ``struct`` are skipped; at the time of writing no struct is exported
+    with PADDLE_API.
+
+    Args:
+        data: Parsed file data whose ``"classes"`` entry maps class names
+            to class entries providing ``"declaration_method"``.
+
+    Returns:
+        A new list holding the matching class entries, in iteration order.
+    """
+    classes = data["classes"]
+    return [
+        classes[name]
+        for name in classes
+        if classes[name]["declaration_method"] != "struct"
+        and "PADDLE_API" in name
+    ]
+
+
+def get_parameters(parameters):
+    """Build a ``name -> {'type', 'intro'}`` mapping from parsed parameters.
+
+    For every parameter the type text is rebuilt: ``" &"`` and ``" *"`` are
+    removed from the parsed type, one ``&`` is appended per ``"reference"``
+    count, a ``*`` is appended when ``"pointer"`` equals 1, and ``const`` is
+    prefixed when ``"constant"`` equals 1 and the text does not already
+    start with it. The description comes from the optional ``'desc'`` key
+    with every run of two spaces removed.
+
+    The parser sometimes reports ``&`` as a parameter name; such entries
+    are left out of the result.
+
+    Args:
+        parameters: Iterable of parsed parameter dictionaries.
+
+    Returns:
+        A dictionary keyed by parameter name.
+    """
+    collected = {}
+    for param in parameters:
+        type_text = param['type'].replace(" &", "").replace(" *", "")
+
+        # Re-attach the reference / pointer markers in a normalized form.
+        type_text = type_text + "&" * param["reference"]
+        if param["pointer"] == 1:
+            type_text = type_text + "*"
+
+        is_const = param["constant"] == 1
+        if is_const and not type_text.startswith('const'):
+            type_text = "const " + type_text
+
+        intro = param.get('desc', '').replace('  ', '')
+
+        # Special handling for the "no parameter name" case.
+        name = param['name']
+        if name != '&':
+            collected[name] = {'type': type_text, 'intro': intro}
+    return collected
+
+
+def parse_doxygen(doxygen):
+    """Split a flattened Doxygen comment into a dictionary of sections.
+
+    The text is split on ``@``; each piece is expected to look like
+    ``<tag> <text>``. Recognized tags are ``brief``, ``note``, ``return``
+    (or its alias ``returns``) and ``param``. Anything before the first
+    ``@`` and any other tag is ignored.
+
+    Example input::
+
+        @brief Construct a Tensor from a buffer pointed to by `data`
+        @param data The pointer to the memory buffer.
+        @return A Tensor object constructed from the buffer
+
+    Args:
+        doxygen: The comment text.
+
+    Returns:
+        A dictionary with the keys ``'intro'`` (brief text), ``'returns'``,
+        ``'note'`` and ``'param_intro'`` (parameter name -> description).
+        Sections that do not occur keep their empty default.
+    """
+    doxygen_dict = {
+        'intro': '',
+        'returns': '',
+        'param_intro': {},
+        'note': '',
+    }
+
+    # Text before the first '@' carries no tag, so it is skipped.
+    for doxygen_part in doxygen.split('@')[1:]:
+        tag, sep, body = doxygen_part.partition(' ')
+        if not sep:
+            # A tag that is not followed by a space has no text to record.
+            continue
+
+        if tag == 'brief':
+            doxygen_dict['intro'] = body
+        elif tag in ('return', 'returns'):
+            doxygen_dict['returns'] = body
+        elif tag == 'param':
+            # partition() keeps the full name even when no description
+            # follows it; slicing with find(' ') == -1 used to cut off the
+            # last character of the name and store the name as description.
+            param_name, _, param_text = body.lstrip().partition(' ')
+            if param_name:
+                doxygen_dict['param_intro'][param_name] = param_text
+        elif tag == 'note':
+            doxygen_dict['note'] = body
+
+    return doxygen_dict