# extract.py

import re
import os
import zipfile
import xml.etree.ElementTree as ET


def is_newline(elem):
    """Return True when *elem* is a WordprocessingML ``pPr`` element.

    The check compares the element's fully qualified tag against the
    ``pPr`` name in the WordprocessingML main namespace.
    """
    paragraph_properties_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr'
    return elem.tag == paragraph_properties_tag


def is_codesnippet_end(attr):
    """Return True when *attr* is one of the values that close a code snippet.

    The closing values are ``'Para'`` and ``'FeatureType'``; anything else
    (including ``None``) yields False.
    """
    return attr in ('Para', 'FeatureType')


def convert_title_to_directory_name(raw_header):
    """Collapse a heading string into a single CamelCase-style name.

    The heading is split on runs of whitespace, ``|`` and ``-``. Every
    non-alphanumeric character is dropped from each piece, and pieces that
    end up empty are skipped. A piece whose first character does not
    report ``istitle()`` is passed through ``str.capitalize`` (which also
    lowercases the rest of that piece); other pieces are kept unchanged.
    The surviving pieces are concatenated without a separator.
    """
    pieces = []
    for token in re.split(r'[\s|-]+', raw_header):
        # Keep only the alphanumeric characters of this token.
        cleaned = ''.join(filter(str.isalnum, token))
        if not cleaned:
            continue

        if cleaned[0].istitle():
            pieces.append(cleaned)
        else:
            pieces.append(cleaned.capitalize())

    return ''.join(pieces)


def get_directory_path_from_stack(stack):
    """Join the ``'title'`` of every stack entry with ``/``, in stack order.

    An empty stack produces an empty string.
    """
    return '/'.join(entry['title'] for entry in stack)


def get_full_file_path_from_stack(stack, file_extension):
    """Build the next example-file path for the entry on top of *stack*.

    The path is ``<directory>/<title>Example<NN>.<file_extension>`` where
    the directory comes from ``get_directory_path_from_stack``, the title is
    the top entry's ``'title'`` and ``NN`` is its ``'next_index'`` (a single
    character number gets a leading ``0``).

    Side effect: the top entry's ``'next_index'`` is incremented by one.
    """
    directory = get_directory_path_from_stack(stack)
    current = stack[-1]

    # Take the current number, then advance the counter for the next call.
    number = current['next_index']
    current['next_index'] = number + 1

    label = str(number)
    if len(label) == 1:
        label = f'0{label}'

    return f'''{directory}/{current['title']}Example{label}.{file_extension}'''


def guess_file_extension(buffer):
    """Pick ``'html'`` or ``'js'`` for a snippet based on its text.

    Returns ``'html'`` when *buffer* contains any of a fixed set of HTML
    markers as a substring, otherwise ``'js'``.
    """
    markers = ('<html>', '<script', '<div>', '<span>', '<p>')
    if any(marker in buffer for marker in markers):
        return 'html'
    return 'js'


# Script body: for every .docx file listed in DOCX_FILES, unpack the archive,
# walk word/document.xml in document order and write each code snippet to a
# file under a directory path built from the headings that enclose it.

# Input files: src/c01.docx .. src/c28.docx followed by src/appendixA..D.docx.
DOCX_FILES = [f'src/c{x:02d}.docx' for x in range(1, 29)] + [
    f'src/appendix{x}.docx' for x in 'ABCD']

# Fully qualified name of the "val" attribute in the WordprocessingML main
# namespace; style names such as 'H1' or 'CodeSnippet' are read from it.
WORD_VAL_ATTRIBUTE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

# Heading style name -> height the heading stack must have once that heading
# has been pushed (chapter/appendix title is the root at height 1).
HEADING_STACK_HEIGHTS = {
    'ChapterTitle': 1,
    'AppendixTitle': 1,
    'H1': 2,
    'H2': 3,
    'H3': 4,
    'H4': 5,
}

for filename in DOCX_FILES:
    # Unpack the archive into a directory named after the file (sans suffix).
    xml_directory = filename.split('.')[0]

    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(xml_directory)

    tree = ET.parse(f'{xml_directory}/word/document.xml')
    root = tree.getroot()

    # Per-document parser state.
    header_text_buffer = ''
    code_text_buffer = ''
    inside_header_tag = False
    inside_code_snippet = False
    expected_stack_height = 0
    # One {'title', 'next_index'} dict per heading level currently open.
    stack = []

    for elem in root.iter():
        # Elements without the attribute yield None, which matches no style.
        attr = elem.attrib.get(WORD_VAL_ATTRIBUTE)

        if attr in HEADING_STACK_HEIGHTS:
            # NOTE(review): a heading that starts while a snippet is still
            # open discards the buffered snippet text once the heading
            # completes (the buffer is cleared below without being written).
            # Confirm that this is intended.
            inside_header_tag = True
            inside_code_snippet = False
            expected_stack_height = HEADING_STACK_HEIGHTS[attr]

        if attr == 'CodeSnippet':
            inside_header_tag = False
            inside_code_snippet = True

        if inside_header_tag and is_newline(elem):
            # The next pPr element marks the end of the heading text.
            inside_header_tag = False

            title = convert_title_to_directory_name(header_text_buffer)

            if expected_stack_height > len(stack) + 1:
                # A heading may be at most one level deeper than the current
                # stack; anything else means a level was skipped.
                raise RuntimeError(
                    f'Heading {title!r} in {filename} expects stack height '
                    f'{expected_stack_height} but the stack only has '
                    f'{len(stack)} entries')

            # Drop the sibling and everything nested below it so that the new
            # heading lands at exactly expected_stack_height. This is a no-op
            # when the heading is one level deeper than the current stack.
            del stack[expected_stack_height - 1:]

            stack.append({
                'title': title,
                'next_index': 1
            })

            header_text_buffer = ''
            code_text_buffer = ''

        if inside_code_snippet and is_codesnippet_end(attr):
            inside_code_snippet = False

            os.makedirs(get_directory_path_from_stack(stack), exist_ok=True)

            file_extension = guess_file_extension(code_text_buffer)
            output_path = get_full_file_path_from_stack(stack, file_extension)

            # Append mode is kept from the original script.
            with open(output_path, 'a', encoding='utf-8') as output_file:
                output_file.write(code_text_buffer)

            header_text_buffer = ''
            code_text_buffer = ''

        if inside_header_tag and elem.text is not None:
            header_text_buffer += elem.text

        if inside_code_snippet:
            if elem.text is not None:
                code_text_buffer += elem.text
            elif is_newline(elem):
                # A text-less pPr element inside a snippet becomes a line break.
                code_text_buffer += '\n'