forked from GoogleCloudPlatform/bigquery-oreilly-book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_code.py
executable file
·106 lines (90 loc) · 3.62 KB
/
extract_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Recursively extracts the code snippets (text set in the Consolas font) from a
Google Doc, including text nested in tables and tables of contents.
"""
from __future__ import print_function
from apiclient import discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools
import sys
# OAuth 2.0 scope requested by get_credentials(): read-only access to documents.
SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
# Discovery document URL passed to discovery.build() in main() to construct the
# Docs v1 service client.
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
def get_credentials():
    """Gets valid user credentials from storage.

    Credentials are looked up in the 'token.json' store. If nothing has been
    stored, or if the stored credentials are invalid, the OAuth 2.0 flow is
    completed using the client secrets in 'credentials.json'; the store is
    handed to the flow so the new credentials can be kept for later runs.

    Returns:
        Credentials, the obtained credential.
    """
    store = file.Storage('token.json')
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
        # Without explicit flags, run_flow() calls tools.argparser.parse_args(),
        # which reads sys.argv and aborts on this script's positional
        # document_id. Parse only the OAuth flags that parser knows about and
        # ignore everything else on the command line.
        flags, _ = tools.argparser.parse_known_args()
        credentials = tools.run_flow(flow, store, flags)
    return credentials
def read_paragraph_element(element, code_font='Consolas'):
    """Returns the text of a ParagraphElement if it is set in the code font.

    Only text runs whose style names `code_font` as the weighted font family
    are kept; every other element (non-text elements, runs in another font,
    runs with no font information) yields an empty string.

    Args:
        element: a ParagraphElement from a Google Doc.
        code_font: font family name that marks a run as code.

    Returns:
        The run's content, or '' when the element is not a code-font text run
        or carries no content. Never returns None, so the result can always be
        concatenated.
    """
    text_run = element.get('textRun')
    if not text_run:
        return ''
    style = text_run.get('textStyle') or {}
    font_family = (style.get('weightedFontFamily') or {}).get('fontFamily', '')
    if font_family != code_font:
        return ''
    # A text run without 'content' must still yield a string for the caller.
    return text_run.get('content') or ''
def read_strucutural_elements(elements):
    """Recurses through a list of Structural Elements to collect a document's text.

    Paragraphs are read directly through read_paragraph_element. Tables and
    tables of contents hold their text in nested Structural Elements, so they
    are handled by recursion (tables may themselves be nested). Elements of
    any other kind are skipped.

    Args:
        elements: a list of Structural Elements. None or an empty list yields
            an empty string.

    Returns:
        The concatenation, in document order, of the text returned by
        read_paragraph_element for every ParagraphElement encountered.
    """
    pieces = []
    for value in elements or []:
        if 'paragraph' in value:
            for elem in value.get('paragraph').get('elements') or []:
                # Guard against a None result so the final join cannot fail.
                pieces.append(read_paragraph_element(elem) or '')
        elif 'table' in value:
            # The text in table cells is in nested Structural Elements and
            # tables may be nested.
            for row in value.get('table').get('tableRows') or []:
                for cell in row.get('tableCells') or []:
                    pieces.append(read_strucutural_elements(cell.get('content')))
        elif 'tableOfContents' in value:
            # The text in the TOC is also in a Structural Element.
            toc = value.get('tableOfContents')
            pieces.append(read_strucutural_elements(toc.get('content')))
    return ''.join(pieces)
def main(DOCUMENT_ID):
    """Fetches a document through the Docs API and prints its extracted text.

    Args:
        DOCUMENT_ID: ID of the Google Doc to read.
    """
    # Authorize an HTTP client with the user's credentials.
    authorized_http = get_credentials().authorize(Http())
    service = discovery.build(
        'docs',
        'v1',
        http=authorized_http,
        discoveryServiceUrl=DISCOVERY_DOC,
    )

    # Fetch the document, then walk the Structural Elements of its body.
    request = service.documents().get(documentId=DOCUMENT_ID)
    document = request.execute()
    body = document.get('body')
    print(read_strucutural_elements(body.get('content')))
if __name__ == '__main__':
    # Exactly one argument is needed: the ID of the document to read.
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so callers and shell scripts can detect the usage error.
        sys.exit("Usage: ./extract_code.py document_id")
    main(sys.argv[1])