[Change]修改目前 Paddle docs 的代码检查方式 (PaddlePaddle#5962)

leo0519 · Jul 1, 2023 · 63b2bfa · 63b2bfa
1 parent dd6138a
commit 63b2bfa
Show file tree

Hide file tree

Showing 2 changed files with 159 additions and 2 deletions.
diff --git a/ci_scripts/chinese_samplecode_processor.py b/ci_scripts/chinese_samplecode_processor.py
@@ -1,3 +1,4 @@
+import re
 import math
 import os
 import pickle
@@ -83,6 +84,82 @@ def find_all(src_str, substr):
     return indices
 
 
+def _hasprefix(line, prefixes):
+    """helper prefix test"""
+    return any(line == p or line.startswith(p + ' ') for p in prefixes)
+
+
+def strip_ps_from_codeblock(codeblock):
+    """strip PS1(>>> ) and PS2(... ) from codeblock
+
+    Note:
+        These two functions should be updated simultaneously:
+        `docs/docs/api/gen_doc.py :: strip_ps_from_codeblock`
+        `docs/ci_scripts/chinese_samplecode_processor.py :: strip_ps_from_codeblock`
+    """
+    match_obj = re.search(r"\n>>>\s?", "\n" + codeblock)
+    if match_obj is None:
+        return codeblock
+
+    TEXT = 'text'  # text of codeblock
+    CODE_PS1 = 'code_ps1'  # PS1(>>> ) of codeblock
+    CODE_PS2 = 'code_ps2'  # PS2(... ) of codeblock
+
+    previous_state = TEXT
+    current_state = None
+
+    codeblock_clean = []
+    for line in codeblock.splitlines():
+        strip_line = line.strip()
+
+        if previous_state == TEXT:
+            # text transitions to source whenever a PS1 line is encountered
+            if _hasprefix(strip_line, ('>>>',)):
+                current_state = CODE_PS1
+            else:
+                current_state = TEXT
+
+        elif previous_state in {CODE_PS1, CODE_PS2}:
+            if len(strip_line) == 0:
+                current_state = TEXT
+
+            # allow source to continue with either PS1 or PS2
+            elif _hasprefix(strip_line, ('>>>', '...')):
+                if strip_line == '...':
+                    if previous_state == CODE_PS2:
+                        current_state = CODE_PS2
+                    else:
+                        current_state = TEXT
+                else:
+                    if _hasprefix(strip_line, ('...',)):
+                        current_state = CODE_PS2
+                    else:
+                        current_state = CODE_PS1
+            else:
+                current_state = TEXT
+        else:
+            raise AssertionError(
+                'Unknown state previous_state={}'.format(previous_state)
+            )
+
+        lstrip_line = line.lstrip()
+        if current_state in {CODE_PS1, CODE_PS2}:
+            match_obj = re.match(r"^\.\.\.\s?|^>>>\s?", lstrip_line)
+            codeblock_clean.append(lstrip_line[match_obj.end() :])
+
+        elif current_state == TEXT:
+            codeblock_clean.append("# {}".format(line))
+
+        else:
+            raise AssertionError(
+                'Unknown state current_state={}'.format(current_state)
+            )
+
+        previous_state = current_state
+
+    return "\n".join(codeblock_clean)
+
+
 def extract_sample_code(srcfile, status_all):
     filename = srcfile.name
     srcc = srcfile.read()
@@ -129,7 +206,9 @@ def extract_sample_code(srcfile, status_all):
                     if srcls[j].find(" code-block:: python") != -1:
                         break
                     content += srcls[j].replace(startindent, "", 1)
-                status.append(run_sample_code(content, filename))
+                status.append(
+                    run_sample_code(strip_ps_from_codeblock(content), filename)
+                )
 
     status_all[filename] = status
     return status_all

diff --git a/docs/api/gen_doc.py b/docs/api/gen_doc.py
@@ -1025,6 +1025,82 @@ def filter_out_object_of_api_info_dict():
             del api_info_dict[id_api]['object']
 
 
+def _hasprefix(line, prefixes):
+    """helper prefix test"""
+    return any(line == p or line.startswith(p + ' ') for p in prefixes)
+
+
+def strip_ps_from_codeblock(codeblock):
+    """strip PS1(>>> ) and PS2(... ) from codeblock
+
+    Note:
+        These two functions should be updated simultaneously:
+        `docs/docs/api/gen_doc.py :: strip_ps_from_codeblock`
+        `docs/ci_scripts/chinese_samplecode_processor.py :: strip_ps_from_codeblock`
+    """
+    match_obj = re.search(r"\n>>>\s?", "\n" + codeblock)
+    if match_obj is None:
+        return codeblock
+
+    TEXT = 'text'  # text of codeblock
+    CODE_PS1 = 'code_ps1'  # PS1(>>> ) of codeblock
+    CODE_PS2 = 'code_ps2'  # PS2(... ) of codeblock
+
+    previous_state = TEXT
+    current_state = None
+
+    codeblock_clean = []
+    for line in codeblock.splitlines():
+        strip_line = line.strip()
+
+        if previous_state == TEXT:
+            # text transitions to source whenever a PS1 line is encountered
+            if _hasprefix(strip_line, ('>>>',)):
+                current_state = CODE_PS1
+            else:
+                current_state = TEXT
+
+        elif previous_state in {CODE_PS1, CODE_PS2}:
+            if len(strip_line) == 0:
+                current_state = TEXT
+
+            # allow source to continue with either PS1 or PS2
+            elif _hasprefix(strip_line, ('>>>', '...')):
+                if strip_line == '...':
+                    if previous_state == CODE_PS2:
+                        current_state = CODE_PS2
+                    else:
+                        current_state = TEXT
+                else:
+                    if _hasprefix(strip_line, ('...',)):
+                        current_state = CODE_PS2
+                    else:
+                        current_state = CODE_PS1
+            else:
+                current_state = TEXT
+        else:
+            raise AssertionError(
+                'Unknown state previous_state={}'.format(previous_state)
+            )
+
+        lstrip_line = line.lstrip()
+        if current_state in {CODE_PS1, CODE_PS2}:
+            match_obj = re.match(r"^\.\.\.\s?|^>>>\s?", lstrip_line)
+            codeblock_clean.append(lstrip_line[match_obj.end() :])
+
+        elif current_state == TEXT:
+            codeblock_clean.append("# {}".format(line))
+
+        else:
+            raise AssertionError(
+                'Unknown state current_state={}'.format(current_state)
+            )
+
+        previous_state = current_state
+
+    return "\n".join(codeblock_clean)
+
+
 def extract_code_blocks_from_docstr(docstr, google_style=True):
     """
     extract code-blocks from the given docstring.
@@ -1082,7 +1158,9 @@ def _append_code_block(in_examples):
         # nonlocal code_blocks, cb_cur, cb_cur_name, cb_cur_seq_id, cb_required
         code_blocks.append(
             {
-                'codes': inspect.cleandoc("\n".join(cb_info['cb_cur'])),
+                'codes': strip_ps_from_codeblock(
+                    inspect.cleandoc("\n" + "\n".join(cb_info['cb_cur']))
+                ),
                 'name': cb_info['cb_cur_name'],
                 'id': cb_info['cb_cur_seq_id'],
                 'required': cb_info['cb_required'],