Skip to content

Commit 8c9fe21

Browse files
committed
re-format
1 parent 12f641d commit 8c9fe21

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+3140
-2261
lines changed

learner/basic.py

-6
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,13 @@ def __init__(self, extensions: str = None, directory: str = None):
1616
self.extensions = extensions
1717
self.directory = directory
1818

19-
2019
def load_file_name(self):
2120
"""
2221
Load the documents from the given directory.
2322
:param directory: The directory to load from.
2423
:return: The list of loaded documents.
2524
"""
2625
return utils.find_files_with_extension(self.directory, self.extensions)
27-
2826

2927
def construct_document_list(self):
3028
"""
@@ -33,7 +31,3 @@ def construct_document_list(self):
3331
:return: The list of metadata for the loaded documents.
3432
"""
3533
pass
36-
37-
38-
39-

learner/indexer.py

+18-12
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@
66
from langchain_community.embeddings import HuggingFaceEmbeddings
77
from langchain_community.vectorstores import FAISS
88
import os
9-
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
109

10+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
1111

1212

1313
def create_indexer(app: str, docs: str, format: str, incremental: bool, save_path: str):
@@ -31,35 +31,41 @@ def create_indexer(app: str, docs: str, format: str, incremental: bool, save_pat
3131
loader = xml_loader.XMLLoader(docs)
3232
documents = loader.construct_document()
3333

34-
print_with_color("Creating indexer for {num} documents for {app}...".format(num=len(documents), app=app), "yellow")
34+
print_with_color(
35+
"Creating indexer for {num} documents for {app}...".format(
36+
num=len(documents), app=app
37+
),
38+
"yellow",
39+
)
3540

3641
if format == "xml":
37-
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
42+
embeddings = HuggingFaceEmbeddings(
43+
model_name="sentence-transformers/all-mpnet-base-v2"
44+
)
3845
else:
3946
raise ValueError("Invalid format: " + format)
40-
47+
4148
db = FAISS.from_documents(documents, embeddings)
4249

4350
if incremental:
4451
if app in records:
4552
print_with_color("Merging with previous indexer...", "yellow")
4653
prev_db = FAISS.load_local(records[app], embeddings)
4754
db.merge_from(prev_db)
48-
55+
4956
db_file_path = os.path.join(save_path, app)
5057
db_file_path = os.path.abspath(db_file_path)
5158
db.save_local(db_file_path)
5259

5360
records[app] = db_file_path
5461

55-
5662
save_json_file("./learner/records.json", records)
5763

58-
print_with_color("Indexer for {app} created successfully. Save in {path}.".format(app=app, path=db_file_path), "green")
64+
print_with_color(
65+
"Indexer for {app} created successfully. Save in {path}.".format(
66+
app=app, path=db_file_path
67+
),
68+
"green",
69+
)
5970

6071
return db_file_path
61-
62-
63-
64-
65-

learner/learn.py

+27-13
Original file line number | Diff line number | Diff line change
@@ -6,28 +6,42 @@
66

77

88
args = argparse.ArgumentParser()
9-
args.add_argument("--app", help="The name of application to learn.",
10-
type=str, default="./")
11-
args.add_argument("--docs", help="The help application of the app.", type=str,
12-
default="./")
13-
args.add_argument("--format", help="The format of the help doc.", type=str,
14-
default="xml")
15-
args.add_argument('--incremental', action='store_true', help='Enable incremental update.')
16-
args.add_argument("--save_path", help="The format of the help doc.", type=str,
17-
default="./vectordb/docs/")
18-
19-
9+
args.add_argument(
10+
"--app", help="The name of application to learn.", type=str, default="./"
11+
)
12+
args.add_argument(
13+
"--docs", help="The help application of the app.", type=str, default="./"
14+
)
15+
args.add_argument(
16+
"--format", help="The format of the help doc.", type=str, default="xml"
17+
)
18+
args.add_argument(
19+
"--incremental", action="store_true", help="Enable incremental update."
20+
)
21+
args.add_argument(
22+
"--save_path",
23+
help="The format of the help doc.",
24+
type=str,
25+
default="./vectordb/docs/",
26+
)
2027

2128

2229
parsed_args = args.parse_args()
2330

31+
2432
def main():
2533
"""
2634
Main function.
2735
"""
2836

29-
indexer.create_indexer(parsed_args.app, parsed_args.docs, parsed_args.format, parsed_args.incremental, parsed_args.save_path)
37+
indexer.create_indexer(
38+
parsed_args.app,
39+
parsed_args.docs,
40+
parsed_args.format,
41+
parsed_args.incremental,
42+
parsed_args.save_path,
43+
)
3044

3145

3246
if __name__ == "__main__":
33-
main()
47+
main()

record_processor/parser/demonstration_record.py

+13-4
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,21 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
33

4+
45
class DemonstrationStep:
56
"""
67
Class for the single step information in the user demonstration record.
78
Multiple steps will be recorded to achieve a specific request.
89
"""
910

10-
def __init__(self, application: str, description: str, action: str, screenshot: str, comment: str):
11+
def __init__(
12+
self,
13+
application: str,
14+
description: str,
15+
action: str,
16+
screenshot: str,
17+
comment: str,
18+
):
1119
"""
1220
Create a new step.
1321
"""
@@ -17,6 +25,7 @@ def __init__(self, application: str, description: str, action: str, screenshot:
1725
self.comment = comment
1826
self.screenshot = screenshot
1927

28+
2029
class DemonstrationRecord:
2130
"""
2231
Class for the user demonstration record.
@@ -40,19 +49,19 @@ def set_request(self, request: str):
4049
Set the request.
4150
"""
4251
self.__request = request
43-
52+
4453
def get_request(self) -> str:
4554
"""
4655
Get the request.
4756
"""
4857
return self.__request
49-
58+
5059
def get_applications(self) -> list:
5160
"""
5261
Get the application.
5362
"""
5463
return self.__applications
55-
64+
5665
def get_step_num(self) -> int:
5766
"""
5867
Get the step number.

record_processor/parser/psr_record_parser.py

+55-35
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,11 @@ def parse_to_record(self) -> DemonstrationRecord:
3636
"""
3737
boundary = self.__find_boundary()
3838
self.parts_dict = self.__split_file_by_boundary(boundary)
39-
self.comments = self.__get_comments(
40-
self.parts_dict['main.htm']['Content'])
41-
self.steps = self.__get_steps(self.parts_dict['main.htm']['Content'])
39+
self.comments = self.__get_comments(self.parts_dict["main.htm"]["Content"])
40+
self.steps = self.__get_steps(self.parts_dict["main.htm"]["Content"])
4241
record = DemonstrationRecord(
43-
list(set(self.applications)), len(self.steps), **self.steps)
42+
list(set(self.applications)), len(self.steps), **self.steps
43+
)
4444

4545
return record
4646

@@ -54,14 +54,14 @@ def __find_boundary(self) -> str:
5454
if boundary_start != -1:
5555
boundary_start += len("boundary=")
5656
boundary_end = self.content.find("\n", boundary_start)
57-
boundary = self.content[boundary_start:boundary_end].strip('\"')
57+
boundary = self.content[boundary_start:boundary_end].strip('"')
5858
return boundary
5959
else:
6060
raise ValueError("Boundary not found in the .mht file.")
6161

6262
def __split_file_by_boundary(self, boundary: str) -> dict:
6363
"""
64-
Split the file by the boundary into parts,
64+
Split the file by the boundary into parts,
6565
Store the parts in a dictionary, including the content type,
6666
content location and content transfer encoding.
6767
boundary: The boundary of the file.
@@ -72,27 +72,36 @@ def __split_file_by_boundary(self, boundary: str) -> dict:
7272
for part in parts:
7373
content_type_start = part.find("Content-Type:")
7474
content_location_start = part.find("Content-Location:")
75-
content_transfer_encoding_start = part.find(
76-
"Content-Transfer-Encoding:")
75+
content_transfer_encoding_start = part.find("Content-Transfer-Encoding:")
7776
part_info = {}
7877
if content_location_start != -1:
7978
content_location_end = part.find("\n", content_location_start)
80-
content_location = part[content_location_start:content_location_end].split(":")[
81-
1].strip()
79+
content_location = (
80+
part[content_location_start:content_location_end]
81+
.split(":")[1]
82+
.strip()
83+
)
8284

8385
# add the content location
8486
if content_type_start != -1:
8587
content_type_end = part.find("\n", content_type_start)
86-
content_type = part[content_type_start:content_type_end].split(":")[
87-
1].strip()
88+
content_type = (
89+
part[content_type_start:content_type_end].split(":")[1].strip()
90+
)
8891
part_info["Content-Type"] = content_type
8992

9093
# add the content transfer encoding
9194
if content_transfer_encoding_start != -1:
9295
content_transfer_encoding_end = part.find(
93-
"\n", content_transfer_encoding_start)
94-
content_transfer_encoding = part[content_transfer_encoding_start:content_transfer_encoding_end].split(":")[
95-
1].strip()
96+
"\n", content_transfer_encoding_start
97+
)
98+
content_transfer_encoding = (
99+
part[
100+
content_transfer_encoding_start:content_transfer_encoding_end
101+
]
102+
.split(":")[1]
103+
.strip()
104+
)
96105
part_info["Content-Transfer-Encoding"] = content_transfer_encoding
97106

98107
content = part[content_location_end:].strip()
@@ -112,25 +121,30 @@ def __get_steps(self, content: str) -> dict:
112121
"""
113122

114123
user_action_data = re.search(
115-
r'<UserActionData>(.*?)</UserActionData>', content, re.DOTALL)
124+
r"<UserActionData>(.*?)</UserActionData>", content, re.DOTALL
125+
)
116126
if user_action_data:
117127

118128
root = ET.fromstring(user_action_data.group(1))
119129
steps = {}
120130

121-
for each_action in root.findall('EachAction'):
131+
for each_action in root.findall("EachAction"):
122132

123-
action_number = each_action.get('ActionNumber')
124-
application = each_action.get('FileName')
125-
description = each_action.find('Description').text
126-
action = each_action.find('Action').text
127-
screenshot_file_name = each_action.find(
128-
'ScreenshotFileName').text
133+
action_number = each_action.get("ActionNumber")
134+
application = each_action.get("FileName")
135+
description = each_action.find("Description").text
136+
action = each_action.find("Action").text
137+
screenshot_file_name = each_action.find("ScreenshotFileName").text
129138
screenshot = self.__get_screenshot(screenshot_file_name)
130139
step_key = f"step_{int(action_number) - 1}"
131140

132141
step = DemonstrationStep(
133-
application, description, action, screenshot, self.comments.get(step_key))
142+
application,
143+
description,
144+
action,
145+
screenshot,
146+
self.comments.get(step_key),
147+
)
134148
steps[step_key] = step
135149
self.applications.append(application)
136150
return steps
@@ -143,16 +157,21 @@ def __get_comments(self, content: str) -> dict:
143157
content: The content of the main.htm file.
144158
return: A dictionary of comments for each step.
145159
"""
146-
soup = BeautifulSoup(content, 'html.parser')
160+
soup = BeautifulSoup(content, "html.parser")
147161
body = soup.body
148-
steps_html = body.find('div', id='Steps')
149-
steps = steps_html.find_all(lambda tag: tag.name == 'div' and tag.has_attr(
150-
'id') and re.match(r'^Step\d+$', tag['id']))
162+
steps_html = body.find("div", id="Steps")
163+
steps = steps_html.find_all(
164+
lambda tag: tag.name == "div"
165+
and tag.has_attr("id")
166+
and re.match(r"^Step\d+$", tag["id"])
167+
)
151168

152169
comments = {}
153170
for index, step in enumerate(steps):
154-
comment_tag = step.find('b', text='Comment: ')
155-
comments[f'step_{index}'] = comment_tag.next_sibling if comment_tag else None
171+
comment_tag = step.find("b", text="Comment: ")
172+
comments[f"step_{index}"] = (
173+
comment_tag.next_sibling if comment_tag else None
174+
)
156175
return comments
157176

158177
def __get_screenshot(self, screenshot_file_name: str) -> str:
@@ -163,11 +182,12 @@ def __get_screenshot(self, screenshot_file_name: str) -> str:
163182
return: The screenshot in base64 string.
164183
"""
165184
screenshot_part = self.parts_dict[screenshot_file_name]
166-
content = screenshot_part['Content']
167-
content_type = screenshot_part['Content-Type']
168-
content_transfer_encoding = screenshot_part['Content-Transfer-Encoding']
185+
content = screenshot_part["Content"]
186+
content_type = screenshot_part["Content-Type"]
187+
content_transfer_encoding = screenshot_part["Content-Transfer-Encoding"]
169188

170-
screenshot = 'data:{type};{encoding}, {content}'.format(
171-
type=content_type, encoding=content_transfer_encoding, content=content)
189+
screenshot = "data:{type};{encoding}, {content}".format(
190+
type=content_type, encoding=content_transfer_encoding, content=content
191+
)
172192

173193
return screenshot

0 commit comments

Comments
 (0)