-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
158 lines (118 loc) · 4.5 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import re
import os
import zipfile
import xml.etree.ElementTree as ET
def is_newline(elem):
    """Return True when ``elem`` is a ``pPr`` element of the WordprocessingML namespace.

    ``elem`` only needs a ``tag`` attribute holding the namespace-qualified
    tag name, as ElementTree elements do.
    """
    paragraph_properties_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr'
    return elem.tag == paragraph_properties_tag
def is_codesnippet_end(attr):
    """Return True when ``attr`` is one of the style names that end a code snippet.

    The two terminating values are ``'Para'`` and ``'FeatureType'``; anything
    else (including ``None``) yields False.
    """
    return attr in ('Para', 'FeatureType')
def convert_title_to_directory_name(raw_header):
    """Collapse a heading string into a single run-together name.

    The heading is split on runs of whitespace, ``|`` and ``-``. Each piece
    has its non-alphanumeric characters removed, and pieces that end up empty
    are dropped. A piece whose first character is not title-cased is passed
    through ``str.capitalize`` (which also lower-cases the rest of the piece);
    other pieces are kept unchanged. The pieces are then concatenated.
    """
    pieces = []
    for chunk in re.split(r'[\s|-]+', raw_header):
        # Keep only letters and digits.
        cleaned = ''.join(filter(str.isalnum, chunk))
        if not cleaned:
            continue
        if cleaned[0].istitle():
            pieces.append(cleaned)
        else:
            pieces.append(cleaned.capitalize())
    return ''.join(pieces)
def get_directory_path_from_stack(stack):
    """Join the ``'title'`` of every stack entry, in order, with ``/``.

    ``stack`` is a sequence of dicts that each have a ``'title'`` key. An
    empty stack gives an empty string.
    """
    return '/'.join(entry['title'] for entry in stack)
def get_full_file_path_from_stack(stack, file_extension):
    """Build the path of the next example file for the innermost stack entry.

    The file lives in the directory given by ``get_directory_path_from_stack``
    and is named ``<title>Example<NN>.<file_extension>``, where ``<title>`` and
    the counter come from the last stack entry. The counter is left-padded
    with ``0`` to at least two characters.

    Side effect: the last entry's ``'next_index'`` is incremented, so
    successive calls yield successive file numbers.
    """
    directory = get_directory_path_from_stack(stack)
    current = stack[-1]

    # Pad to two characters before bumping the counter for the next call.
    file_number = str(current['next_index']).rjust(2, '0')
    current['next_index'] += 1

    return f'''{directory}/{current['title']}Example{file_number}.{file_extension}'''
def guess_file_extension(buffer):
    """Pick ``'html'`` or ``'js'`` for a snippet based on its text.

    Returns ``'html'`` when ``buffer`` contains any of a fixed set of HTML
    markers, and ``'js'`` otherwise.
    """
    html_markers = ('<html>', '<script', '<div>', '<span>', '<p>')
    if any(marker in buffer for marker in html_markers):
        return 'html'
    return 'js'
# Input documents: src/c01.docx .. src/c28.docx, then src/appendixA.docx ..
# src/appendixD.docx.
_CHAPTER_DOCX_FILES = [f'src/c{number:02d}.docx' for number in range(1, 29)]
_APPENDIX_DOCX_FILES = [f'src/appendix{letter}.docx' for letter in 'ABCD']
DOCX_FILES = _CHAPTER_DOCX_FILES + _APPENDIX_DOCX_FILES
# Main extraction pass.
#
# For every file in DOCX_FILES: unpack the archive, walk word/document.xml in
# document order, track the current heading hierarchy on a stack, and write
# each code snippet to a numbered file under the directory named by that stack.

# Attribute whose value is compared against style names such as 'H1',
# 'CodeSnippet' and 'Para'.
_STYLE_VAL_ATTR = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

# Heading style name -> depth that heading's entry occupies on the stack.
_HEADER_STACK_HEIGHTS = {
    'ChapterTitle': 1,
    'AppendixTitle': 1,
    'H1': 2,
    'H2': 3,
    'H3': 4,
    'H4': 5,
}

for filename in DOCX_FILES:
    # The archive is unpacked into a directory named after the file minus its
    # extension (e.g. src/c01.docx -> src/c01).
    xml_directory = filename.split('.')[0]
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(xml_directory)

    root = ET.parse(f'{xml_directory}/word/document.xml').getroot()

    header_text_buffer = ''
    code_text_buffer = ''
    inside_header_tag = False
    inside_code_snippet = False
    expected_stack_height = 0
    # One dict per open heading level: {'title': ..., 'next_index': ...}.
    stack = []

    for elem in root.iter():
        # None when the element has no such attribute.
        attr = elem.attrib.get(_STYLE_VAL_ATTR)

        if attr in _HEADER_STACK_HEIGHTS:
            # A heading starts; any snippet still being collected is dropped
            # (its buffer is cleared once the heading text is complete).
            inside_header_tag = True
            inside_code_snippet = False
            expected_stack_height = _HEADER_STACK_HEIGHTS[attr]

        if attr == 'CodeSnippet':
            inside_header_tag = False
            inside_code_snippet = True

        if inside_header_tag and is_newline(elem):
            # The heading text is complete: turn it into a stack entry.
            inside_header_tag = False
            title = convert_title_to_directory_name(header_text_buffer)

            if expected_stack_height > len(stack) + 1:
                # A heading more than one level deeper than the current one.
                # The original raised a bare string here, which Python 3
                # rejects with a TypeError; raise a real exception instead.
                raise RuntimeError(
                    'This should never happen! '
                    f'Heading {title!r} in {filename} needs stack height '
                    f'{expected_stack_height} but the stack height is {len(stack)}'
                )

            # Close sibling/deeper headings so the new entry lands at
            # position expected_stack_height.
            while len(stack) >= expected_stack_height:
                stack.pop()

            stack.append({
                'title': title,
                'next_index': 1
            })

            header_text_buffer = ''
            code_text_buffer = ''

        if inside_code_snippet and is_codesnippet_end(attr):
            # The snippet is complete: flush it to the next numbered file.
            inside_code_snippet = False

            # exist_ok tolerates a directory created by an earlier snippet
            # while still surfacing genuine failures (e.g. permissions).
            os.makedirs(get_directory_path_from_stack(stack), exist_ok=True)

            file_extension = guess_file_extension(code_text_buffer)
            output_path = get_full_file_path_from_stack(stack, file_extension)
            # Append mode is kept from the original: re-running the script
            # adds to existing files rather than truncating them.
            with open(output_path, "a") as output_file:
                output_file.write(code_text_buffer)

            header_text_buffer = ''
            code_text_buffer = ''

        if inside_header_tag:
            if elem.text is not None:
                header_text_buffer += elem.text

        if inside_code_snippet:
            if elem.text is not None:
                code_text_buffer += elem.text
            elif is_newline(elem):
                # A text-less pPr element inside a snippet becomes a line break.
                code_text_buffer += '\n'