forked from microsoft/autogen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_browser_utils.py
174 lines (142 loc) · 5.95 KB
/
test_browser_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import pytest
import os
import sys
import requests
import hashlib
import re
import math
from agentchat.test_assistant_agent import KEY_LOC # noqa: E402
# --- Expected values for the AutoGen blog post used in the HTML/pagination checks ---
BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = (
    "Does Model and Inference Parameter Matter in LLM Applications? "
    "- A Case Study for MATH | AutoGen"
)
BLOG_POST_STRING = (
    "Large language models (LLMs) are powerful tools that can generate natural language texts "
    "for various applications, such as chatbots, summarization, translation, and more. "
    "GPT-4 is currently the state of the art LLM in the world. "
    "Is model selection irrelevant? "
    "What about inference parameters?"
)

# --- Expected values for the Wikipedia page check ---
WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft - Wikipedia"
WIKIPEDIA_STRING = "Redmond"

# --- Non-HTML resources: plain text, an image to download, and a PDF ---
PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"
IMAGE_URL = "https://github.com/afourney.png"
PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

# --- Bing search query and the title/content strings derived from it ---
BING_QUERY = "Microsoft"
BING_TITLE = f"{BING_QUERY} - Search"
BING_STRING = f"A Bing search for '{BING_QUERY}' found"
# skip_all: True when the browser utilities cannot be imported.
skip_all = False
try:
    from autogen.browser_utils import SimpleTextBrowser
except ImportError:
    skip_all = True

# skip_bing: True when no Bing API key is present in the environment.
# BING_API_KEY is only bound when the environment variable exists.
skip_bing = "BING_API_KEY" not in os.environ
if not skip_bing:
    BING_API_KEY = os.environ["BING_API_KEY"]
def _rm_folder(path):
    """Delete every regular file directly inside `path`, then remove `path` itself.

    Only a flat layout is supported: entries that are not regular files are
    left alone, so the final `os.rmdir` raises if any such entry remains.
    """
    # os.listdir is evaluated up front, so deleting while iterating is safe.
    entries = (os.path.join(path, entry) for entry in os.listdir(path))
    for file_path in filter(os.path.isfile, entries):
        os.remove(file_path)
    os.rmdir(path)
@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_simple_text_browser():
    """Exercise SimpleTextBrowser against live web resources.

    Covers, in order: visiting an HTML page, viewport pagination, scrolling
    with page_down/page_up (including past both ends), a Wikipedia page, a
    plain-text file, an image download (verified by MD5), and a PDF.

    Requires network access. Downloads go to a "downloads" folder under
    KEY_LOC; the folder is removed afterwards even if an assertion fails.
    """
    # Bound the direct HTTP calls so a stalled server fails the test instead of hanging it.
    http_timeout = 60  # seconds

    # Create a downloads folder (removing any leftover ones from prior tests)
    downloads_folder = os.path.join(KEY_LOC, "downloads")
    if os.path.isdir(downloads_folder):
        _rm_folder(downloads_folder)
    os.mkdir(downloads_folder)

    try:
        # Instantiate the browser
        user_agent = "python-requests/" + requests.__version__
        viewport_size = 1024
        browser = SimpleTextBrowser(
            downloads_folder=downloads_folder,
            viewport_size=viewport_size,
            request_kwargs={
                "headers": {"User-Agent": user_agent},
            },
        )

        # Test that we can visit a page and find what we expect there
        top_viewport = browser.visit_page(BLOG_POST_URL)
        assert browser.viewport == top_viewport
        assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
        assert BLOG_POST_STRING in browser.page_content

        # Check if page splitting works
        approx_pages = math.ceil(len(browser.page_content) / viewport_size)  # May be fewer, since it aligns to word breaks
        assert len(browser.viewport_pages) <= approx_pages
        assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
        assert browser.viewport_pages[0][0] == 0
        assert browser.viewport_pages[-1][1] == len(browser.page_content)

        # Make sure we can reconstruct the full contents from the split pages
        buffer = "".join(browser.page_content[bounds[0] : bounds[1]] for bounds in browser.viewport_pages)
        assert buffer == browser.page_content

        # Test scrolling (scroll all the way to the bottom)
        for i in range(1, len(browser.viewport_pages)):
            browser.page_down()
            assert browser.viewport_current_page == i

        # Test scrolling beyond the bottom limit: the page index must stay clamped to the last page
        for _ in range(5):
            browser.page_down()
            assert browser.viewport_current_page == len(browser.viewport_pages) - 1

        # Test scrolling back up (checks every page down to page 1; page 0 is reached in the next loop)
        for i in range(len(browser.viewport_pages) - 2, 0, -1):
            browser.page_up()
            assert browser.viewport_current_page == i

        # Test scrolling beyond the top limit: the page index must stay clamped to 0
        for _ in range(5):
            browser.page_up()
            assert browser.viewport_current_page == 0

        # Test Wikipedia handling
        assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
        assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

        # Visit a plain-text file
        response = requests.get(PLAIN_TEXT_URL, timeout=http_timeout)
        response.raise_for_status()
        expected_results = response.text

        browser.visit_page(PLAIN_TEXT_URL)
        assert browser.page_content.strip() == expected_results.strip()

        # Directly download an image, and compute its md5
        response = requests.get(IMAGE_URL, stream=True, timeout=http_timeout)
        response.raise_for_status()
        expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

        # Visit an image causing it to be downloaded by the SimpleTextBrowser, then compute its md5
        viewport = browser.visit_page(IMAGE_URL)
        m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport)
        # Fail with a readable message rather than an AttributeError on None
        assert m is not None, f"no download notice found in viewport: {viewport!r}"
        fetched_url = m.group(1)
        download_loc = m.group(2)
        assert fetched_url == IMAGE_URL

        with open(download_loc, "rb") as fh:
            downloaded_md5 = hashlib.md5(fh.read()).hexdigest()

        # MD5s should match
        assert expected_md5 == downloaded_md5

        # Fetch a PDF
        viewport = browser.visit_page(PDF_URL)
        assert PDF_STRING in viewport
    finally:
        # Clean up, even when an assertion above failed, so no files leak into later runs
        _rm_folder(downloads_folder)
@pytest.mark.skipif(
    skip_all or skip_bing,
    reason="do not run bing tests if dependency is not installed or key is missing",
)
def test_bing_search():
    """Run a Bing query through SimpleTextBrowser and check the results page.

    Skipped when SimpleTextBrowser could not be imported or when the
    BING_API_KEY environment variable is not set; either condition would
    otherwise surface as a NameError rather than a skip.
    """
    # Instantiate the browser
    user_agent = "python-requests/" + requests.__version__
    browser = SimpleTextBrowser(
        bing_api_key=BING_API_KEY,
        viewport_size=1024,
        request_kwargs={
            "headers": {"User-Agent": user_agent},
        },
    )

    # The "bing: " prefix routes the request to a Bing search instead of a URL fetch
    assert BING_STRING in browser.visit_page("bing: " + BING_QUERY)
    assert BING_TITLE == browser.page_title

    # Search results are expected to fit in a single viewport page spanning the whole content
    assert len(browser.viewport_pages) == 1
    assert browser.viewport_pages[0] == (0, len(browser.page_content))
if __name__ == "__main__":
    # Runs this file's tests from the command line.
    # Calling the test functions directly bypasses pytest's skipif markers, so
    # honor the same flags here: without SimpleTextBrowser or BING_API_KEY the
    # tests would fail with a NameError instead of being skipped.
    if not skip_all:
        test_simple_text_browser()
        if not skip_bing:
            test_bing_search()