Documentation extraction tool

weichaojie · Jul 21, 2015 · ad06e76 · ad06e76
1 parent 2ac80e7
commit ad06e76
Show file tree

Hide file tree

Showing 3 changed files with 201 additions and 0 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tools/clang"]
+	path = tools/clang
+	url = https://github.com/wjakob/clang-cindex-python3
diff --git a/tools/clang b/tools/clang
diff --git a/tools/mkdoc.py b/tools/mkdoc.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+#
+#  Syntax: mkdoc.py [-I<path> ..] [.. a list of header files ..]
+#
+#  Extract documentation from C++ header files to use it in Python bindings
+#
+
+import os, sys, platform, re, textwrap
+from clang import cindex
+from clang.cindex import CursorKind
+from collections import OrderedDict
+
+# On macOS, point the clang bindings at the directory holding
+# /opt/llvm/lib/libclang.dylib, but only when that file exists on disk.
+if platform.system() == 'Darwin':
+    libclang = '/opt/llvm/lib/libclang.dylib'
+    if os.path.exists(libclang):
+        cindex.Config.set_library_path(os.path.split(libclang)[0])
+
+# Cursor kinds whose children extract() descends into.
+RECURSE_LIST = [
+    CursorKind.TRANSLATION_UNIT, CursorKind.NAMESPACE,
+    CursorKind.CLASS_DECL, CursorKind.STRUCT_DECL,
+    CursorKind.CLASS_TEMPLATE,
+]
+
+# Cursor kinds for which extract() prints a docstring variable.
+PRINT_LIST = [
+    CursorKind.CLASS_DECL, CursorKind.STRUCT_DECL,
+    CursorKind.CLASS_TEMPLATE,
+    CursorKind.FUNCTION_DECL, CursorKind.FUNCTION_TEMPLATE,
+    CursorKind.CXX_METHOD, CursorKind.CONSTRUCTOR,
+    CursorKind.FIELD_DECL,
+]
+
+# Maps the symbol that follows the C++ "operator" keyword to a mnemonic that
+# is safe to embed in a C identifier (used by sanitize_name).
+CPP_OPERATORS = {
+    # Comparison
+    '<=': 'le', '>=': 'ge', '==': 'eq', '!=': 'ne', '<': 'lt', '>': 'gt',
+    # Subscript and call; without '()' the parentheses of "operator()"
+    # would end up verbatim in the generated variable name.
+    '[]': 'array', '()': 'call',
+    # Compound assignment
+    '+=': 'iadd', '-=': 'isub', '*=': 'imul', '/=': 'idiv', '%=': 'imod',
+    '&=': 'iand', '|=': 'ior', '^=': 'ixor',
+    '<<=': 'ilshift', '>>=': 'irshift',
+    # Increment / decrement and shifts
+    '++': 'inc', '--': 'dec', '<<': 'lshift', '>>': 'rshift',
+    # Logical and bitwise
+    '&&': 'land', '||': 'lor', '!': 'lnot', '~': 'bnot',
+    '&': 'band', '|': 'bor',
+    # Arithmetic and plain assignment
+    '+': 'add', '-': 'sub', '*': 'mul', '/': 'div', '%': 'mod',
+    '=': 'assign',
+}
+# Longest symbols first, so that e.g. "<<=" is substituted before "<<" or "<"
+# gets a chance to match a prefix of it.
+CPP_OPERATORS = OrderedDict(sorted(CPP_OPERATORS.items(), key=lambda t: -len(t[0])))
+
+# Sanitized name -> number of times it has been requested so far; used by
+# sanitize_name to give repeated names (e.g. overloads) distinct suffixes.
+registered_names = dict()
+
+def d(s):
+    """Return *s* as text.
+
+    ``bytes`` are decoded as UTF-8.  A value that is already a ``str`` is
+    returned unchanged, so the helper works whether the clang bindings hand
+    back bytes or text.
+    """
+    if isinstance(s, bytes):
+        return s.decode('utf8')
+    return s
+
+def sanitize_name(name):
+    """Turn a C++ name into a unique ``__doc_``-prefixed C identifier.
+
+    Operator symbols are replaced by their mnemonics from CPP_OPERATORS and
+    every remaining character that is not a word character (template
+    brackets, spaces, commas, parentheses, ...) becomes ``_``.  When the same
+    sanitized name is requested again, a numeric suffix (``_2``, ``_3``, ...)
+    is appended so that the emitted variables do not clash.
+
+    name -- the name to sanitize, as a str
+
+    Returns the identifier, including the ``__doc_`` prefix.  Each call
+    records the returned name in the module-level ``registered_names``.
+    """
+    for k, v in CPP_OPERATORS.items():
+        name = name.replace('operator%s' % k, 'operator_%s' % v)
+    # Covers '<', '>', ' ' and ',' as well as anything else that would be
+    # illegal in a C identifier.
+    name = re.sub(r'\W', '_', name)
+    if name in registered_names:
+        base = name
+        # Keep counting until the suffixed candidate is unused too; a plain
+        # "base_N" could otherwise clash with a symbol really named that way.
+        while True:
+            registered_names[base] += 1
+            name = '%s_%d' % (base, registered_names[base])
+            if name not in registered_names:
+                break
+    registered_names[name] = 1
+    return '__doc_' + name
+
+def process_comment(comment):
+    """Turn a raw C++ documentation comment into plain docstring text.
+
+    Comment delimiters (``/* ... */``, ``///`` and leading ``*``) are removed,
+    a handful of Doxygen commands and HTML tags are rewritten as reST-style
+    markup, and every paragraph is re-wrapped to 75 columns.  Text following
+    a section heading such as ``Returns:`` is indented by four spaces.
+
+    comment -- the raw comment as a str (may be empty)
+
+    Returns the processed text with trailing whitespace removed.
+    """
+    # Remove C++ comment syntax
+    lines = []
+    for s in comment.splitlines():
+        s = s.strip()
+        opens = s.startswith('/*')
+        if opens:
+            s = s[2:]
+        # Tested independently of the opener so that a one-line comment such
+        # as "/** text */" loses its terminator as well.
+        closes = s.endswith('*/')
+        if closes:
+            s = s[:-2].rstrip('* \t')
+        if opens:
+            s = s.lstrip('* \t')
+        elif not closes and s.startswith('///'):
+            s = s[3:]
+        if s.startswith('*'):
+            s = s[1:]
+        lines.append(s.strip() + '\n')
+    s = ''.join(lines)
+
+    # Doxygen tags (raw strings: these fragments are regular expressions)
+    cpp_group = r'([\w:]+)'
+    # The optional direction attribute may contain a comma, as in "[in,out]"
+    param_group = r'([\[\w:,\]]+)'
+
+    s = re.sub(r'\\c\s+%s' % cpp_group, r'``\1``', s)
+    s = re.sub(r'\\a\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\e\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\em\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\b\s+%s' % cpp_group, r'**\1**', s)
+    s = re.sub(r'\\param%s?\s+%s' % (param_group, cpp_group), r'\n\n$Parameter "\2":\n\n', s)
+
+    # A leading '$' marks a section heading for the re-flow step below.  The
+    # word boundary keeps a short tag from matching the start of a longer one
+    # (e.g. "author" inside "authors").
+    for in_, out_ in (
+            ('return', 'Returns'),
+            ('returns', 'Returns'),
+            ('author', 'Author'),
+            ('authors', 'Authors'),
+            ('copyright', 'Copyright'),
+            ('date', 'Date'),
+            ('remark', 'Remark'),
+            ('remarks', 'Remark'),
+            ('sa', 'See also'),
+            ('see', 'See also'),
+            ('extends', 'Extends'),
+            ('throw', 'Throws'),
+            ('throws', 'Throws')):
+        s = re.sub(r'\\%s\b\s*' % in_, r'\n\n$%s:\n\n' % out_, s)
+
+    s = re.sub(r'\\details\b\s*', r'\n\n', s)
+    s = re.sub(r'\\brief\b\s*', r'', s)
+    s = re.sub(r'\\short\b\s*', r'', s)
+    s = re.sub(r'\\ref\b\s*', r'', s)
+
+    # HTML tags
+    s = re.sub(r'<tt>([^<]*)</tt>', r'``\1``', s)
+    s = re.sub(r'<em>([^<]*)</em>', r'*\1*', s)
+    s = re.sub(r'<b>([^<]*)</b>', r'**\1**', s)
+
+    s = s.replace('``true``', '``True``')
+    s = s.replace('``false``', '``False``')
+
+    # Re-flow text
+    wrapper = textwrap.TextWrapper()
+    wrapper.expand_tabs = True
+    wrapper.replace_whitespace = True
+    wrapper.width = 75
+    wrapper.initial_indent = wrapper.subsequent_indent = ''
+
+    result = ''
+    for x in re.split(r'\n{2,}', s):
+        x = x.strip()
+        if x.startswith('$'):
+            # Headings are never indented, even directly after another
+            # heading; the paragraph that follows one is indented instead.
+            wrapper.initial_indent = wrapper.subsequent_indent = ''
+            result += wrapper.fill(x[1:]) + '\n'
+            wrapper.initial_indent = wrapper.subsequent_indent = ' '*4
+        else:
+            result += wrapper.fill(x) + '\n\n'
+            wrapper.initial_indent = wrapper.subsequent_indent = ''
+    return result.rstrip()
+
+
+def extract(filename, node, prefix):
+    """Print docstring variables for *node* and the declarations below it.
+
+    filename -- path of the header being processed; cursors located in any
+                other file are skipped
+    node     -- the clang cursor to process
+    prefix   -- '_'-joined names of the enclosing scopes ('' at top level)
+
+    Returns the number of docstring variables printed for this subtree.
+    """
+    num_extracted = 0
+    if not (node.location.file is None or os.path.samefile(d(node.location.file.name), filename)):
+        return 0
+    if node.kind in RECURSE_LIST:
+        sub_prefix = prefix
+        if node.kind != CursorKind.TRANSLATION_UNIT:
+            if len(sub_prefix) > 0:
+                sub_prefix += '_'
+            sub_prefix += d(node.spelling)
+        for i in node.get_children():
+            num_extracted += extract(filename, i, sub_prefix)
+        # A scope from which nothing was extracted is not printed either.
+        if num_extracted == 0:
+            return 0
+    if node.kind in PRINT_LIST:
+        comment = d(node.raw_comment) if node.raw_comment is not None else ''
+        comment = process_comment(comment)
+        # Only insert the '_' separator when there is an enclosing scope;
+        # otherwise a global symbol would be emitted as "__doc__name", which
+        # the generated DOC(name) macro ("__doc_name") cannot refer to.
+        qualified = d(node.spelling)
+        if len(prefix) > 0:
+            qualified = prefix + '_' + qualified
+        name = sanitize_name(qualified)
+        print('\nstatic const char *%s = %sR"doc(%s)doc";' % (name, '\n' if '\n' in comment else '', comment))
+        num_extracted += 1
+    return num_extracted
+
+# Script entry point: arguments starting with '-' are passed on to clang,
+# everything else is treated as a header file to process.  The generated
+# C++ goes to stdout, progress messages go to stderr.
+if __name__ == '__main__':
+    parameters = ['-x', 'c++', '-std=c++11']
+    filenames = []
+
+    for item in sys.argv[1:]:
+        if item.startswith('-'):
+            parameters.append(item)
+        else:
+            filenames.append(item)
+
+    if len(filenames) == 0:
+        print('Syntax: %s [.. a list of header files ..]' % sys.argv[0])
+        # sys.exit rather than the interactive helper "exit", which is only
+        # installed by the site module.
+        sys.exit(-1)
+
+    # Preamble: the DOC(...) macro joins up to five name components into the
+    # matching __doc_* variable name.  __DOC5 needs "##" on both sides of n4;
+    # written as "n4_" the preprocessor would paste that literal token
+    # instead of the macro argument.
+    print('''/*
+  This file contains docstrings for the Python bindings.
+  Do not edit! These were automatically extracted by mkdoc.py
+ */
+
+#define __COUNT(_1, _2, _3, _4, _5, COUNT, ...)  COUNT
+#define __VA_SIZE(...)                           __COUNT(__VA_ARGS__, 5, 4, 3, 2, 1)
+#define __CAT1(a, b)                             a ## b
+#define __CAT2(a, b)                             __CAT1(a, b)
+#define __DOC1(n1)                               __doc_##n1
+#define __DOC2(n1, n2)                           __doc_##n1##_##n2
+#define __DOC3(n1, n2, n3)                       __doc_##n1##_##n2##_##n3
+#define __DOC4(n1, n2, n3, n4)                   __doc_##n1##_##n2##_##n3##_##n4
+#define __DOC5(n1, n2, n3, n4, n5)               __doc_##n1##_##n2##_##n3##_##n4##_##n5
+#define DOC(...)                                 __CAT2(__DOC, __VA_SIZE(__VA_ARGS__))(__VA_ARGS__)''')
+    for filename in filenames:
+        print('Processing "%s"..' % filename, file = sys.stderr)
+        index = cindex.Index(cindex.conf.lib.clang_createIndex(False, True))
+        tu = index.parse(filename, parameters)
+        extract(filename, tu.cursor, '')