utils/style_doc.py

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Style utils for the .rst and the docstrings."""

import argparse
import os
import re
import warnings
from enum import Enum


# Names of the rst directives (like `.. note::`) whose content is regular text and should therefore be styled.
TEXTUAL_BLOCKS = ["note", "warning"]
# List of acceptable characters for titles and sections underline.
TITLE_SPECIAL_CHARS = """= - ` : ' " ~ ^ _ * + # < >""".split(" ")
# Special words introducing a section in docstrings, written as regex fragments (s? means the s is optional).
DOC_SPECIAL_WORD = [
    "Args?",
    "Params?",
    "Parameters?",
    "Arguments?",
    "Examples?",
    "Usage",
    "Returns?",
    "Raises?",
    "Attributes?",
]

# Regexes
# Matches any declaration of textual block, like `.. note::`. (ignore case to avoid writing all versions in the list)
_re_textual_blocks = re.compile(r"^\s*\.\.\s+(" + "|".join(TEXTUAL_BLOCKS) + r")\s*::\s*$", re.IGNORECASE)
# Matches list introduction in rst (`- `, `* ` or `1. `); the captured group includes the leading indentation.
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
# Captures the leading whitespace of a line that contains at least one non-whitespace character.
_re_indent = re.compile(r"^(\s*)\S")
# Matches a table border in rst, like `+-----+-----+` (this is how tables are detected).
_re_table = re.compile(r"(\+-+)+\+\s*$")
# Matches a code block introduction in rst: a line containing only `::`.
_re_code_block = re.compile(r"^\s*::\s*$")
# Matches an explicit code block declaration `.. code-block::` (only when it starts at the beginning of the line).
_re_code_block_explicit = re.compile(r"^\.\.\s+code\-block::")
# Matches any block of the form `.. something::` or `.. something:: bla`.
_re_ignore = re.compile(r"^\s*\.\.\s+(.*?)\s*::\s*\S*\s*$")
# Matches comment introduction in rst (a line ending with `..`).
_re_comment = re.compile(r"\s*\.\.\s*$")
# Matches the special tag to ignore some paragraphs (`.. docstyle-ignore` or `# docstyle-ignore`).
_re_doc_ignore = re.compile(r"(\.\.|#)\s*docstyle-ignore")
# Matches the example introduction in docstrings (any line ending with `::`).
_re_example = re.compile(r"::\s*$")
# Matches the parameters introduction in docstrings (a line like `Args:`).
_re_arg_def = re.compile(r"^\s*(Args?|Parameters?|Params|Arguments?|Environment|Attributes?)\s*:\s*$")
# Matches the return introduction in docstrings (a line like `Returns:`, `Raises:` or `Note:`).
_re_return = re.compile(r"^\s*(Returns?|Raises?|Note)\s*:\s*$")
# Matches any doc special word alone on its line and followed by one or two colons.
_re_any_doc_special_word = re.compile(r"^\s*(" + "|".join(DOC_SPECIAL_WORD) + r")::?\s*$")


class SpecialBlock(Enum):
    """Kinds of block a paragraph can belong to, as tracked by the stylers in their `in_block` attribute."""

    # Regular text: paragraphs are styled normally.
    NOT_SPECIAL = 0
    # Block whose paragraphs are returned untouched by `CodeStyler.style_paragraph`.
    NO_STYLE = 1
    # Argument list: lines at the arguments' indentation level are kept as is, the more indented ones are styled.
    ARG_LIST = 2


def split_text_in_lines(text, max_len, prefix="", min_indent=None):
    """
    Wrap `text` greedily so that lines are filled as much as possible without exceeding `max_len`.

    Any run of whitespace in `text` (new lines included) is treated as a single space. The first line starts with
    `prefix` and the following ones are indented with as many spaces as `prefix` is long. When `min_indent` is passed
    and is longer than `prefix`, `prefix` is left-padded with spaces up to that length and the following lines are
    indented with `min_indent` instead.

    A word that does not fit in `max_len` is never cut: it ends up alone on a line that is too long.

    Returns the wrapped lines joined with new lines.
    """
    continuation = " " * len(prefix)
    # `continuation` and `prefix` have the same length at this point, so a single test covers both adjustments.
    if min_indent is not None and len(prefix) < len(min_indent):
        continuation = min_indent
        prefix = " " * (len(min_indent) - len(prefix)) + prefix

    first_word, *other_words = re.sub(r"\s+", " ", text).split(" ")
    wrapped = []
    pending = prefix + first_word
    for word in other_words:
        candidate = pending + " " + word
        if len(candidate) <= max_len:
            # The word still fits on the line being built.
            pending = candidate
            continue
        # The line is full: store it and begin a new one with the word that did not fit.
        wrapped.append(pending)
        pending = continuation + word
    wrapped.append(pending)
    return "\n".join(wrapped)


def get_indent(line):
    """Return the leading whitespace of `line`, or an empty string if `line` has no non-whitespace character."""
    match = _re_indent.search(line)
    if match is None:
        return ""
    return match.group(1)


class CodeStyler:
    """
    A generic class to style .rst files.

    The styler is stateful: while `style` goes through the paragraphs of a text, the attributes `in_block`,
    `current_indent` and `previous_indent` track which kind of special block (if any) the current paragraph belongs
    to. Subclasses can customize the detection of those blocks by overriding `is_no_style_block`,
    `is_comment_or_textual_block`, `is_special_block`, `init_in_block` and `end_of_special_style`.
    """

    def __init__(self):
        # Defaults making `style_paragraph` usable on its own; `style` resets the three attributes at each call.
        # Kind of special block the paragraph being treated belongs to.
        self.in_block = SpecialBlock.NOT_SPECIAL
        # Indentation of the interior of the current special block (`None` as long as it has not been seen).
        self.current_indent = ""
        # Never read by this class; available to subclasses (`DocstringStyler` stores an indentation in it).
        self.previous_indent = None

    def is_no_style_block(self, line):
        """Whether or not `line` introduces a block where styling should be ignored."""
        if _re_code_block.search(line) is not None:
            return True
        # Textual blocks are of the form `.. something::` too, but their content should be styled.
        if _re_textual_blocks.search(line) is not None:
            return False
        return _re_ignore.search(line) is not None

    def is_comment_or_textual_block(self, line):
        """Whether or not `line` introduces a block where styling should not be ignored (note, warnings...)"""
        if _re_comment.search(line):
            return True
        return _re_textual_blocks.search(line) is not None

    def is_special_block(self, line):
        """
        Whether or not `line` introduces a special block.

        When it does, `self.in_block` is updated accordingly as a side effect.
        """
        if self.is_no_style_block(line):
            self.in_block = SpecialBlock.NO_STYLE
            return True
        return False

    def init_in_block(self, text):
        """
        Returns the initial value for `self.in_block`.

        Useful for some docstrings beginning inside an argument declaration block (all models).
        """
        return SpecialBlock.NOT_SPECIAL

    def end_of_special_style(self, line):
        """
        Sets back the `in_block` attribute to `NOT_SPECIAL`.

        Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
        """
        self.in_block = SpecialBlock.NOT_SPECIAL

    def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
        """
        Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
        is passed.

        The paragraph is also returned untouched when `self.in_block` is `SpecialBlock.NO_STYLE`. Lists are styled
        item by item, comments/textual blocks keep their first line as is, and inside an argument list
        (`SpecialBlock.ARG_LIST`) only the lines that are not at the indentation level `self.current_indent` are
        rewrapped. `min_indent` is forwarded to `split_text_in_lines`.

        Returns the styled paragraph as a single string (lines joined with new lines, no trailing new line).
        """
        if len(paragraph) == 0:
            return ""
        if no_style or self.in_block == SpecialBlock.NO_STYLE:
            return "\n".join(paragraph)
        list_search = _re_list.search(paragraph[0])
        if list_search is not None:
            # Great, we're in a list. So we need to split our paragraphs in smaller parts, one for each item.
            result = ""
            remainder = ""
            prefix = list_search.groups()[0]
            prefix_indent = get_indent(paragraph[0])
            current_item = [paragraph[0][len(prefix) :]]
            for i, line in enumerate(paragraph[1:]):
                new_item_search = _re_list.search(line)
                indent = get_indent(line)
                if len(indent) < len(prefix_indent) or (len(indent) == len(prefix_indent) and new_item_search is None):
                    # There might not be an empty line after the list, formatting the remainder recursively.
                    # (`i` indexes `paragraph[1:]`, so `paragraph[i + 1]` is the current `line`.)
                    remainder = "\n" + self.style_paragraph(
                        paragraph[i + 1 :], max_len, no_style=no_style, min_indent=min_indent
                    )
                    break
                elif new_item_search is not None:
                    # A new item begins: style the one we were accumulating and start over.
                    text = " ".join([item_line.strip() for item_line in current_item])
                    result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent) + "\n"
                    prefix = new_item_search.groups()[0]
                    prefix_indent = indent
                    current_item = [line[len(prefix) :]]
                else:
                    # Continuation of the current item.
                    current_item.append(line)
            # Treat the last item
            text = " ".join([item_line.strip() for item_line in current_item])
            result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent)
            # Add the potential remainder
            return result + remainder

        if len(paragraph) > 1 and self.is_comment_or_textual_block(paragraph[0]):
            # Comments/notes in rst should be restyled with indentation, ignoring the first line.
            indent = get_indent(paragraph[1])
            text = " ".join([block_line.strip() for block_line in paragraph[1:]])
            return paragraph[0] + "\n" + split_text_in_lines(text, max_len, indent, min_indent=min_indent)

        if self.in_block == SpecialBlock.ARG_LIST:
            # Arg lists are special: we need to ignore the lines that are at the first indentation level beneath the
            # Args/Parameters (parameter description), then we can style the indentation level beneath.
            result = ""
            # The args/parameters could be in that paragraph and should be ignored
            if _re_arg_def.search(paragraph[0]) is not None:
                if len(paragraph) == 1:
                    return paragraph[0]
                result += paragraph[0] + "\n"
                paragraph = paragraph[1:]

            if self.current_indent is None:
                # The paragraph may contain a single line here, in which case there is no second line to look at
                # (reading `paragraph[1]` unconditionally raised an `IndexError`).
                reference_line = paragraph[1] if len(paragraph) > 1 else paragraph[0]
                self.current_indent = get_indent(reference_line)

            current_item = []
            for line in paragraph:
                if get_indent(line) == self.current_indent:
                    # Line kept as is: style what was accumulated before it first.
                    if len(current_item) > 0:
                        item_indent = get_indent(current_item[0])
                        text = " ".join([item_line.strip() for item_line in current_item])
                        result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
                    result += line + "\n"
                    current_item = []
                else:
                    current_item.append(line)
            if len(current_item) > 0:
                item_indent = get_indent(current_item[0])
                text = " ".join([item_line.strip() for item_line in current_item])
                result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
            # Remove the trailing new line.
            return result[:-1]

        # Regular paragraph: everything is merged and rewrapped with the indentation of the first line.
        indent = get_indent(paragraph[0])
        text = " ".join([paragraph_line.strip() for paragraph_line in paragraph])
        return split_text_in_lines(text, max_len, indent, min_indent=min_indent)

    def style(self, text, max_len=119, min_indent=None):
        """
        Style `text` to `max_len`.

        `text` is split in paragraphs (at empty lines, after title underlines and before nested lists) which are
        styled one by one with `style_paragraph`, keeping track of the special blocks along the way. `min_indent` is
        forwarded to `style_paragraph`.

        Returns the styled text.
        """
        new_lines = []
        paragraph = []
        self.current_indent = ""
        self.previous_indent = None
        # If one of those is True, the paragraph should not be touched (code samples, lists...)
        no_style = False
        no_style_next = False
        self.in_block = self.init_in_block(text)
        # If this is True, we force-break a paragraph, even if there is no new empty line.
        break_paragraph = False

        lines = text.split("\n")
        last_line = None
        for line in lines:
            # New paragraph
            line_is_empty = len(line.strip()) == 0
            list_begins = (
                _re_list.search(line) is not None
                and last_line is not None
                and len(get_indent(line)) > len(get_indent(last_line))
            )
            if line_is_empty or break_paragraph or list_begins:
                if len(paragraph) > 0:
                    if self.in_block != SpecialBlock.NOT_SPECIAL:
                        indent = get_indent(paragraph[0])
                        # Are we still in a no-style block?
                        if self.current_indent is None:
                            # If current_indent is None, we haven't begun the interior of the block so the answer is
                            # yes, unless we have an indent of 0 in which case the special block took one line only.
                            if len(indent) == 0:
                                self.in_block = SpecialBlock.NOT_SPECIAL
                            else:
                                self.current_indent = indent
                        elif not indent.startswith(self.current_indent):
                            # If not, we are leaving the block when we unindent.
                            self.end_of_special_style(paragraph[0])

                    if self.is_special_block(paragraph[0]):
                        # Maybe we are starting a special block.
                        if len(paragraph) > 1:
                            # If we have the interior of the block in the paragraph, we grab the indent.
                            self.current_indent = get_indent(paragraph[1])
                        else:
                            # We will determine the indent with the next paragraph
                            self.current_indent = None
                    styled_paragraph = self.style_paragraph(
                        paragraph, max_len, no_style=no_style, min_indent=min_indent
                    )
                    new_lines.append(styled_paragraph + "\n")
                else:
                    new_lines.append("")

                paragraph = []
                no_style = no_style_next
                no_style_next = False
                last_line = None
                if (not break_paragraph and not list_begins) or line_is_empty:
                    # The line only closed the paragraph, there is nothing to add to the next one.
                    break_paragraph = False
                    continue
                break_paragraph = False

            # Title and section lines should go to the max + add a new paragraph.
            if (
                len(set(line)) == 1
                and line[0] in TITLE_SPECIAL_CHARS
                and last_line is not None
                and len(line) >= len(last_line)
            ):
                line = line[0] * max_len
                break_paragraph = True
            # proper doc comment indicates the next paragraph should be no-style.
            if _re_doc_ignore.search(line) is not None:
                no_style_next = True
            # Tables are in just one paragraph and should be no-style.
            if _re_table.search(line) is not None:
                no_style = True
            paragraph.append(line)
            last_line = line

        # Just have to treat the last paragraph. It could still be in a no-style block (or not)
        if len(paragraph) > 0:
            # Are we still in a special block
            # (if current_indent is None, we are but no need to set it since we are the end.)
            # Same test as in the loop above: whatever the kind of special block, we leave it when we unindent.
            # (The test used to skip `NO_STYLE` blocks, so an unindented last paragraph following a code block was
            # never styled.)
            if self.in_block != SpecialBlock.NOT_SPECIAL and self.current_indent is not None:
                indent = get_indent(paragraph[0])
                if not indent.startswith(self.current_indent):
                    self.end_of_special_style(paragraph[0])
            # Only called for its side effect on `self.in_block`.
            _ = self.is_special_block(paragraph[0])
            new_lines.append(self.style_paragraph(paragraph, max_len, no_style=no_style, min_indent=min_indent) + "\n")
        return "\n".join(new_lines)


class DocstringStyler(CodeStyler):
    """Styler for docstrings: the main `style` method comes from `CodeStyler`, only the block detection differs."""

    def is_no_style_block(self, line):
        """Whether or not `line` introduces a block left untouched (a line ending with `::`), textual blocks aside."""
        if _re_textual_blocks.search(line) is not None:
            return False
        return any(pattern.search(line) is not None for pattern in (_re_example, _re_code_block))

    def is_comment_or_textual_block(self, line):
        """
        Whether or not `line` introduces a block styled with indentation beneath its first line.

        On top of the blocks detected by the parent class, this covers the return introductions (`Returns:`,
        `Raises:`, `Note:`), in which case `self.in_block` is also set back to `NOT_SPECIAL`.
        """
        if _re_return.search(line) is None:
            return super().is_comment_or_textual_block(line)
        self.in_block = SpecialBlock.NOT_SPECIAL
        return True

    def is_special_block(self, line):
        """
        Whether or not `line` introduces a special block (no-style block or argument list).

        `self.in_block` is updated as a side effect. When a no-style block begins inside an argument list, the
        indentation of that list is saved in `self.previous_indent` (if nothing is stored there yet).
        """
        if self.is_no_style_block(line):
            inside_arg_list = self.in_block == SpecialBlock.ARG_LIST
            if inside_arg_list and self.previous_indent is None:
                self.previous_indent = self.current_indent
            self.in_block = SpecialBlock.NO_STYLE
            return True
        if _re_arg_def.search(line) is None:
            return False
        self.in_block = SpecialBlock.ARG_LIST
        return True

    def end_of_special_style(self, line):
        """
        Leaves the current special block.

        If an argument list indentation was saved and `line` starts with it, we go back to that argument list.
        Otherwise we are back to regular text and the saved indentation is dropped.
        """
        back_in_arg_list = self.previous_indent is not None and line.startswith(self.previous_indent)
        if back_in_arg_list:
            self.in_block = SpecialBlock.ARG_LIST
            self.current_indent = self.previous_indent
            return
        self.in_block = SpecialBlock.NOT_SPECIAL
        self.previous_indent = None

    def init_in_block(self, text):
        """
        Returns the initial value for `self.in_block`.

        `text` is considered to begin inside an argument list when its first non-empty line ends with a colon and is
        either the only line left, followed by a more indented line, or followed by a line with an indentation of the
        same length that also ends with a colon. In that case `self.current_indent` is set to the indentation of
        that first line.
        """
        lines = text.split("\n")
        # Ignore the leading empty lines.
        first = 0
        while first < len(lines) and len(lines[first]) == 0:
            first += 1
        lines = lines[first:]
        if not lines:
            return SpecialBlock.NOT_SPECIAL
        if not re.search(r":\s*$", lines[0]):
            return SpecialBlock.NOT_SPECIAL

        indent = get_indent(lines[0])
        if len(lines) > 1:
            next_indent_len = len(get_indent(lines[1]))
            is_deeper = next_indent_len > len(indent)
            is_sibling_declaration = next_indent_len == len(indent) and re.search(r":\s*$", lines[1])
            if not (is_deeper or is_sibling_declaration):
                return SpecialBlock.NOT_SPECIAL
        self.current_indent = indent
        return SpecialBlock.ARG_LIST


# Module-level styler instances: `rst_styler` is used by `style_rst_file` and `doc_styler` by `style_docstring`.
rst_styler = CodeStyler()
doc_styler = DocstringStyler()


def _reindent_code_blocks(text):
    """
    Make sure the content of the code blocks in `text` is indented with at least four spaces.

    A code block is introduced by a line containing only `::` or by a `.. code-block::` declaration. Its content
    begins at the first indented line found from there and goes on as long as the lines are empty or indented. When
    that first line is indented with less than four characters, this indentation is replaced by four spaces on every
    non-empty line of the block.

    Returns the text with the code blocks re-indented.
    """
    lines = text.split("\n")
    num_lines = len(lines)
    idx = 0
    while idx < num_lines:
        # Detect if the line is the start of a new code-block.
        starts_code_block = (
            _re_code_block.search(lines[idx]) is not None or _re_code_block_explicit.search(lines[idx]) is not None
        )
        if not starts_code_block:
            idx += 1
            continue

        # Go to the first indented line. The bound check is necessary when no indented line follows the declaration
        # (for instance when the declaration ends the text): going past the last line raised an `IndexError`.
        while idx < num_lines and len(get_indent(lines[idx])) == 0:
            idx += 1
        if idx == num_lines:
            break

        indent = len(get_indent(lines[idx]))
        should_continue = True
        while should_continue:
            if len(lines[idx]) > 0 and indent < 4:
                lines[idx] = " " * 4 + lines[idx][indent:]
            idx += 1
            # We stay in the block as long as the lines are empty or indented.
            should_continue = (idx < num_lines) and (len(lines[idx]) == 0 or len(get_indent(lines[idx])) > 0)

    return "\n".join(lines)


def _add_new_lines_before_list(text):
    """
    Insert an empty line before the first item of a list when the line preceding it is not empty.

    A list begins at the first line matching a list introduction while we are not already inside a list. We are
    considered out of the list at the first line that is not an item and does not start with the indentation of the
    first item of the list.
    """
    output = []
    inside_list = False
    previous = None
    for line in text.split("\n"):
        is_item = _re_list.search(line) is not None
        if is_item and not inside_list:
            # A new list begins here.
            list_indent = get_indent(line)
            inside_list = True
            if previous:
                output.append("")
        if inside_list and not is_item and not line.startswith(list_indent):
            inside_list = False
        output.append(line)
        previous = line
    return "\n".join(output)


def _add_new_lines_before_doc_special_words(text):
    """
    Insert an empty line before each line containing only a doc special word followed by one or two colons (like
    `Args:` or `Example::`), when the line preceding it is not empty.
    """
    output = []
    # Starting from an empty "previous line" guarantees nothing is inserted before the very first line.
    previous = ""
    for line in text.split("\n"):
        if previous and _re_any_doc_special_word.search(line) is not None:
            output.append("")
        output.append(line)
        previous = line
    return "\n".join(output)


def style_rst_file(doc_file, max_len=119, check_only=False):
    """
    Style the rst file `doc_file` to `max_len`.

    The code blocks are re-indented, the missing empty lines before lists are added, then the result is styled. The
    file is overwritten when the styling changes its content, unless `check_only` is set.

    Returns whether or not the styled content differs from the content of the file.
    """
    with open(doc_file, "r", encoding="utf-8", newline="\n") as reader:
        original = reader.read()

    # Cleanups first (indentation of code blocks, then empty lines before lists), styling last.
    prepared = _add_new_lines_before_list(_reindent_code_blocks(original))
    restyled = rst_styler.style(prepared, max_len=max_len)

    needs_update = restyled != original
    if needs_update and not check_only:
        print(f"Overwriting content of {doc_file}.")
        with open(doc_file, "w", encoding="utf-8", newline="\n") as writer:
            writer.write(restyled)

    return needs_update


def style_docstring(docstring, max_len=119):
    """
    Style `docstring` (the text between the triple quotes, quotes excluded) to `max_len`.

    Docstrings on a single line shorter than `max_len` are returned as is. Otherwise the result begins and ends with
    a new line, followed by the indentation found on the last line of `docstring`, so that the closing triple quotes
    end up alone on their line.
    """
    if "\n" not in docstring and len(docstring) < max_len:
        return docstring

    # The indentation is read on the last line.
    final_line = docstring.split("\n")[-1]
    blank_search = re.search(r"^(\s*)$", final_line)
    if blank_search is None:
        # The closing triple quotes are next to some text (this is fixed when the result is built).
        text_search = _re_indent.search(final_line)
        indent = "" if text_search is None else text_search.groups()[0]
    else:
        # The last line only holds the indentation of the closing triple quotes: remove it, it is added back at the
        # end.
        indent = blank_search.groups()[0]
        if len(indent) > 0:
            docstring = docstring[: -len(indent)]

    # Missing empty lines are added before Args/Returns etc. first, then before lists, and the result is styled.
    prepared = _add_new_lines_before_list(_add_new_lines_before_doc_special_words(docstring))
    styled = doc_styler.style(prepared, max_len=max_len, min_indent=indent)

    # Make sure the text is surrounded by new lines.
    if not styled.startswith("\n"):
        styled = "\n" + styled
    if not styled.endswith("\n"):
        styled += "\n"
    return styled + indent


def style_file_docstrings(code_file, max_len=119, check_only=False):
    """
    Style to `max_len` the content of every string delimited by triple double quotes in `code_file`.

    A string is left as is when the text between it and the previous one contains a `docstyle-ignore` marker. The
    file is overwritten when the styling changes its content, unless `check_only` is set.

    Returns whether or not the styled content differs from the content of the file.
    """
    with open(code_file, "r", encoding="utf-8", newline="\n") as reader:
        code = reader.read()

    # Once the code is split on the triple quotes, the pieces at odd positions are the ones inside the quotes.
    pieces = code.split('"""')
    restyled = []
    for position, piece in enumerate(pieces):
        inside_quotes = position % 2 == 1
        if inside_quotes and _re_doc_ignore.search(pieces[position - 1]) is None:
            piece = style_docstring(piece, max_len=max_len)
        restyled.append(piece)
    clean_code = '"""'.join(restyled)

    needs_update = clean_code != code
    if needs_update and not check_only:
        print(f"Overwriting content of {code_file}.")
        with open(code_file, "w", encoding="utf-8", newline="\n") as writer:
            writer.write(clean_code)

    return needs_update


def style_doc_files(*files, max_len=119, check_only=False):
    """
    Style all `files` to `max_len`, overwriting the ones that need it unless `check_only` is set.

    Folders are explored recursively (only their sub-folders, .rst files and .py files are considered). A warning is
    issued for any other path that is neither an .rst nor a .py file.

    Returns the list of the files whose styled content differs from their current content.
    """
    changed = []
    for path in files:
        # Treat folders
        if os.path.isdir(path):
            children = (os.path.join(path, name) for name in os.listdir(path))
            eligible = [child for child in children if os.path.isdir(child) or child.endswith((".rst", ".py"))]
            changed.extend(style_doc_files(*eligible, max_len=max_len, check_only=check_only))
            continue

        # Pick the styling function matching the extension.
        if path.endswith(".rst"):
            restyle = style_rst_file
        elif path.endswith(".py"):
            restyle = style_file_docstrings
        else:
            warnings.warn(f"Ignoring {path} because it's not a py or an rst file or a folder.")
            continue
        if restyle(path, max_len=max_len, check_only=check_only):
            changed.append(path)
    return changed


def main(*files, max_len=119, check_only=False):
    """
    Style all `files` (files or folders) to `max_len`.

    With `check_only`, nothing is overwritten and a `ValueError` is raised if at least one file should be restyled.
    Otherwise the number of restyled files is printed (when there are some).
    """
    changed = style_doc_files(*files, max_len=max_len, check_only=check_only)
    num_changed = len(changed)
    if num_changed == 0:
        return
    if check_only:
        raise ValueError(f"{num_changed} files should be restyled!")
    print(f"Cleaned {num_changed} files!")


# Command-line entry point: restyle (or only check with `--check_only`) the files and folders passed.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.")
    # The default matters: without it, omitting `--max_len` passes `None` down to the stylers, which compare line
    # lengths to it and fail with a `TypeError`. 119 is the default used by all the functions of this module.
    parser.add_argument("--max_len", type=int, default=119, help="The maximum length of lines.")
    parser.add_argument("--check_only", action="store_true", help="Whether to only check and not fix styling issues.")
    args = parser.parse_args()

    main(*args.files, max_len=args.max_len, check_only=args.check_only)