forked from ethen8181/machine-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconverter.py
271 lines (224 loc) · 9.48 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""
Convert existing jupyter notebook to an airbnb knowledge repo format
"""
import os
import re
import json
import subprocess
from dateutil import parser as date_parser
def main(ml_repo, knowledge_repo, inplace):
ml_repo_path = os.path.abspath(ml_repo)
knowledge_repo_path = os.path.abspath(knowledge_repo)
if not os.path.isdir(knowledge_repo_path):
init_knowledge_repo(knowledge_repo_path)
convert_all_posts(ml_repo_path, knowledge_repo_path, inplace)
def init_knowledge_repo(path):
cmd = 'knowledge_repo --repo {} init'.format(path)
subprocess.call(cmd, shell = True)
def convert_all_posts(path, knowledge_repo_path, inplace):
"""Recursive walk down all directory to perform the conversion"""
if os.path.isdir(path):
files = [os.path.join(path, f) for f in os.listdir(path)]
for f in files:
convert_all_posts(f, knowledge_repo_path, inplace)
elif '-converted' not in path:
head, ext = os.path.splitext(path)
if ext == ".ipynb":
try:
converter = IpynbConverter(knowledge_repo_path, inplace)
notebook = converter.convert(path)
converter.add(notebook)
except Exception as e:
print('Skipping: {}'.format(path))
print(e)
class IpynbConverter:
"""
Converts Jupyter notebook to airbnb knowledge repo format [1]_.
Parameters
----------
knowledge_repo_path : str
Path to store the airbnb knowledge repo-ed notebook.
inplace : bool
Whether to perform the conversion inplace or not. If
false, then it will create a new notebook that has the
'-converted' appended to the file name.
Attributes
----------
date_created_ : str
Input notebook's creation date.
date_updated_ : str
Input notebook's latest updated date.
tags_ : str
The notebook's filename is use as the tag in this automated
conversion process. e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb,
we would use 'decision_tree' as the tag.
github_link_ : str
Notebook's original link on github.
title_ : str
Notebook's title, uses the first level 1 markdown header that's not
'Table of Contents' that could be automatically generated by newer
version of notebook. e.g. # Decision Tree (Classification)\n, then
Decision Tree (Classification) would be our title.
References
----------
.. [1] `Airbnb knowledge repo
<https://github.com/airbnb/knowledge-repo>`_
"""
AUTHOR = 'Ethen Liu'
DATE_FORMAT = '%Y-%m-%d'
REPO_NAME = 'machine-learning'
BASE_URL = 'https://github.com/ethen8181/'
def __init__(self, knowledge_repo_path, inplace):
self.inplace = inplace
self.knowledge_repo_path = knowledge_repo_path
def convert(self, path):
"""
Convert the input path's notebook to a knowledge repo. This
will add a mandatory raw cell that contains the yaml information
needed by the knowledge repo and an additional cell that contains
link to the notebook on github.
Parameters
----------
path : str
Path that has the '.ipynb' extension.
Returns
-------
notebook : dict
Updated Jupyter notebook's raw json represented in dictionary format.
Ready to be passed to the .add method to add to the knowledge repo.
"""
self.date_created_ = self._date_created(path)
self.date_updated_ = self._date_updated(path)
self.tags_, self.github_link_ = self._tags_and_github_link(path)
with open(path, encoding = 'utf-8') as f:
notebook = json.load(f)
self.title_ = self._title(notebook)
# prepend the dictionary header to notebook['cells']
notebook['cells'] = ([self._construct_header()] +
[self._construct_github_link_cell()] +
notebook['cells'])
if not self.inplace:
head, ext = os.path.splitext(path)
head += '-converted'
path = head + ext
self._path = path
return notebook
def _date_created(self, path):
"""Grab the date of creation through git log."""
cmd = 'git log --diff-filter=A --follow --format=%cd -1 -- {}'.format(path)
return self._git_date_cmd(cmd)
def _date_updated(self, path):
"""Grab the last date modified through git log."""
cmd = 'git log --format=%cd -1 -- {}'.format(path)
return self._git_date_cmd(cmd)
def _git_date_cmd(self, cmd):
"""Run bash command to retrieve and format date string."""
date_str = subprocess.check_output(cmd, shell = True)
date_dt = date_parser.parse(date_str)
formatted_date = date_dt.strftime(self.DATE_FORMAT)
return formatted_date
def _tags_and_github_link(self, path):
"""
Use file name as tags, e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb
we would use 'decision_tree' as the tag
"""
_, file_path = path.split(self.REPO_NAME)
_, file_name = os.path.split(file_path)
tags, _ = os.path.splitext(file_name)
# /blob/master indicates github master branch
link = self.BASE_URL + self.REPO_NAME + '/blob/master' + file_path
return tags, link
def _title(self, notebook):
"""
A title in the notebook always starts with the '#' indicating a
markdown level 1 header e.g. # Decision Tree (Classification)\n
thus we can just parse all the text in between the '#' and the line break '\n'
"""
# TODO : we could fall back to the file path if it doesn't exist perhaps?
title_pattern = re.compile('# (.*)\n')
for cell in notebook['cells']:
if cell['cell_type'] == 'markdown':
# the [0] indicates the # title pattern
# should always appear in the first line
source = cell['source'][0]
matched = title_pattern.match(source)
if matched is not None:
title = matched.group(1)
# newer version of notebooks includes a
# Table of Contents automatically in the first
# cell, skip that and find the next level 1 header
if not title == 'Table of Contents':
break
return title
def _construct_header(self):
"""Create a knowledge repo style header as a dictionary."""
def flatten_list(l):
"""
Although not needed for the current version, we could
have multiple tags and authors, in that case we would
need to flatten them out.
"""
flat = []
for item in l:
if isinstance(item, list):
flat += item
else:
flat.append(item)
return flat
header = {'cell_type': 'raw', 'metadata': {}}
# header text required by the knowledge repo
# a '- ' in front is required for knowledge repo tag
header_text = [
'---',
'title: {}'.format(self.title_),
'authors:',
'- {}'.format(self.AUTHOR),
'tags:',
'- ' + self.tags_,
'created_at: {}'.format(self.date_created_),
'updated_at: {}'.format(self.date_updated_),
'tldr: Nothing for tldr section as of now.',
'---']
header_text = flatten_list(header_text)
header_text = [text + '\n' for text in header_text[:-1]] + [header_text[-1]]
header['source'] = header_text
return header
def _construct_github_link_cell(self):
"""Add a cell that contains link to original notebook on github"""
github_link_cell = {
'cell_type': 'markdown',
'metadata': {},
'source': ['Link to original notebook: {}'.format(self.github_link_)]}
return github_link_cell
def add(self, notebook):
"""
Add the converted notebook to the knowledge repo.
Parameters
----------
notebook : dict
Jupyter notebook's raw json represented in dictionary format.
"""
with open(self._path, 'w', encoding = 'utf-8') as f:
json.dump(notebook, f)
# create a run knowledge repo command
destination = os.path.join(self.knowledge_repo_path, 'project', self.tags_)
cmd = 'knowledge_repo --repo {} add {} -p {}'.format(
self.knowledge_repo_path, self._path, destination)
p = subprocess.Popen(
cmd, stdout = subprocess.PIPE, stdin = subprocess.PIPE,
stderr = subprocess.STDOUT, shell = True)
p.communicate(input = b'generated by automated airbnb knowledge repo setup')
if not self.inplace:
os.remove(self._path)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(
description = 'Convert the machine-learning repository to an Airbnb Knowledge Repo.')
parser.add_argument(
'--ml_repo', type = str, help = 'Path to the root directory of the machine-learning repo.')
parser.add_argument(
'--knowledge_repo', type = str, help = 'Path to the knowledge repo.')
parser.add_argument(
'--inplace', action = 'store_true', help = 'Modify the existing .ipynb in place.')
args = vars(parser.parse_args())
main(**args)