forked from Significant-Gravitas/AutoGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
browse.py
134 lines (103 loc) · 3.89 KB
/
browse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import requests
from bs4 import BeautifulSoup
from config import Config
from llm_utils import create_chat_completion
# Module-level configuration instance; summarize_text reads
# cfg.fast_llm_model from it to choose the model for chat completions.
cfg = Config()
def scrape_text(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        whitespace-only fragments dropped and the remaining fragments joined
        by newlines. When the request fails or the server answers with an
        HTTP status of 400 or above, a string starting with ``"Error: "`` is
        returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report transport failures (DNS, refused connection, timeout,
        # malformed URL) the same way HTTP errors are reported below.
        return "Error: " + str(exc)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "Error: HTTP " + str(response.status_code) + " error"

    soup = BeautifulSoup(response.text, "html.parser")

    # Script and style bodies are not human-readable page text.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # NOTE(review): splitting on a single space places every word on its own
    # output line -- confirm this is intended; a wider separator would keep
    # phrases together.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    return "\n".join(chunk for chunk in chunks if chunk)
def extract_hyperlinks(soup):
    """Return a list of ``(link text, href)`` tuples for every ``<a>`` element
    in *soup* that carries an ``href`` attribute."""
    return [
        (anchor.text, anchor["href"])
        for anchor in soup.find_all("a", href=True)
    ]
def format_hyperlinks(hyperlinks):
    """Render each ``(text, url)`` pair as the string ``"text (url)"`` and
    return the results as a list, in input order."""
    return [f"{label} ({target})" for label, target in hyperlinks]
def scrape_links(url, timeout=10):
    """Download a web page and list the hyperlinks it contains.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        A list of ``"link text (href)"`` strings, one per ``<a href=...>``
        element. When the request fails or the server answers with an HTTP
        status of 400 or above, a string starting with ``"Error: "`` is
        returned instead (same format as ``scrape_text``).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report transport failures (DNS, refused connection, timeout,
        # malformed URL) the same way HTTP errors are reported below.
        return "Error: " + str(exc)

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        # Include the status code so the caller can tell what went wrong.
        return "Error: HTTP " + str(response.status_code) + " error"

    soup = BeautifulSoup(response.text, "html.parser")

    for tag in soup(["script", "style"]):
        tag.extract()

    return format_hyperlinks(extract_hyperlinks(soup))
def split_text(text, max_length=8192):
    """Split *text* into newline-joined chunks of at most *max_length* characters.

    Paragraphs (lines separated by ``"\\n"``) are packed greedily into the
    current chunk; each paragraph is budgeted as its length plus one for the
    joining newline. A paragraph longer than *max_length* is cut into
    consecutive slices of *max_length* characters so that no chunk exceeds
    the limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Yields:
        The chunks, in order. Empty chunks are never produced except for the
        single ``""`` chunk that results from empty input.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    current_chunk = []
    current_length = 0

    for paragraph in text.split("\n"):
        # Slice oversized paragraphs; an empty paragraph still counts as one
        # (empty) piece so blank lines keep their place in the chunk.
        pieces = [
            paragraph[start:start + max_length]
            for start in range(0, len(paragraph), max_length)
        ] or [paragraph]

        for piece in pieces:
            needed = len(piece) + 1  # +1 for the joining newline
            # Flush only a non-empty chunk: flushing an empty one would emit
            # a useless "" chunk ahead of an oversized first paragraph.
            if current_chunk and current_length + needed > max_length:
                yield "\n".join(current_chunk)
                current_chunk = []
                current_length = 0
            current_chunk.append(piece)
            current_length += needed

    if current_chunk:
        yield "\n".join(current_chunk)
def _build_summary_messages(content, is_website):
    """Return the one-message chat prompt asking the model to summarize *content*."""
    if is_website:
        instruction = (
            "Please summarize the following website text, do not describe the "
            "general website, but instead concisely extract the specific "
            "information this subpage contains.: "
        )
    else:
        instruction = (
            "Please summarize the following text, focusing on extracting "
            "concise and specific information: "
        )
    return [{"role": "user", "content": instruction + content}]


def summarize_text(text, is_website=True):
    """Summarize *text* with the fast LLM model, chunk by chunk.

    The text is split with ``split_text``; each chunk is summarized on its
    own, the partial summaries are joined with newlines, and that combined
    text is summarized once more to produce the final result. Progress is
    printed to standard output.

    Args:
        text: The text to summarize.
        is_website: When true, the prompt asks for the specific information
            on a website subpage; otherwise a generic summarization prompt
            is used.

    Returns:
        The final summary returned by ``create_chat_completion``, or
        ``"Error: No text to summarize"`` when *text* is empty or ``None``.
    """
    if not text:
        return "Error: No text to summarize"

    print("Text length: " + str(len(text)) + " characters")

    chunks = list(split_text(text))
    summaries = []

    for index, chunk in enumerate(chunks, start=1):
        print("Summarizing chunk " + str(index) + " / " + str(len(chunks)))
        summary = create_chat_completion(
            model=cfg.fast_llm_model,
            messages=_build_summary_messages(chunk, is_website),
            max_tokens=300,
        )
        summaries.append(summary)

    print("Summarized " + str(len(chunks)) + " chunks.")

    # Summarize the combined summary
    combined_summary = "\n".join(summaries)
    return create_chat_completion(
        model=cfg.fast_llm_model,
        messages=_build_summary_messages(combined_summary, is_website),
        max_tokens=300,
    )