This repository has been archived by the owner on Dec 17, 2022. It is now read-only.
forked from Showndarya/Hacktoberfest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scrapper.py
159 lines (122 loc) · 4.36 KB
/
web_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
Script that generates word definition by the input data of words and
creates new files or updates existing definitions
Before you start
1. create new virtual environment with python3
2. pip install requests
3. pip install bs4
You are ready!
Just add your favourite words in the list at the end of the file and
Run script: 'python web_scrapper.py'
Before you commit check if the word definition is appropriate
Have fun :)
"""
import json
import os
import requests
import time
from bs4 import BeautifulSoup
# Base lookup URL; the word itself is appended to form the definition page.
OXFORD_URL = "https://en.oxforddictionaries.com/definition/"
# Parent of the directory containing this script; definition JSON files are
# written into per-letter subdirectories under this root.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Parts of speech accepted by parse_html(); sections labelled with anything
# else are skipped.
PARTS_OF_SPEECH = [
    "Noun",
    "Pronoun",
    "Verb",
    "Adverb",
    "Adjective",
    "Preposition",
    "Conjunction",
    "Interjection",
]
def build_api_url(word):
    """Return the dictionary definition URL for *word*."""
    return "{}{}".format(OXFORD_URL, word)
def download_page(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` (after printing a diagnostic) when the request fails
    or the server answers with an error status, so callers can skip words
    whose page could not be downloaded.
    """
    try:
        # timeout so a stalled connection cannot hang the whole run;
        # raise_for_status so a 404/500 page is not parsed as content
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # narrow catch: only network/HTTP failures, not programming errors
        print("Error: could not download", url, "-", e)
        return None
def parse_html(word, html_doc):
    """Parse a dictionary page and collect definitions of *word*.

    Returns a list of dicts, one per recognised part of speech, shaped as
    ``{"word": ..., "definitions": [...], "parts-of-speech": ...}``.
    Only the first section found for each part of speech is kept; sections
    with an unknown part of speech or no definitions are skipped with a
    printed notice.
    """
    result = []
    found_pos = set()  # parts of speech already collected for this word
    soup = BeautifulSoup(html_doc, 'html.parser')
    for section in soup.findAll("section", {"class": "gramb"}):
        try:
            # the first <span> in a section labels its part of speech
            part_of_speech = section.find('span').string.title()
        except Exception:
            # section without the expected <span>/string structure
            print("No string argument ", word)
            continue
        if part_of_speech not in PARTS_OF_SPEECH:
            print("Skipping, unknown part-of-speech: ", part_of_speech)
            continue
        if part_of_speech in found_pos:
            # a later section repeats a part of speech we already have
            print("Word have more than one definition: ", word)
            continue
        found_pos.add(part_of_speech)
        # every definition string lives in a span.ind inside a ul.semb
        definitions = [
            span.string
            for ul in section.findAll("ul", {"class": "semb"})
            for span in ul.findAll("span", {"class": "ind"})
        ]
        if not definitions:
            print("No definition found: ", word)
            continue
        result.append({
            "word": word.title(),
            "definitions": definitions,
            "parts-of-speech": part_of_speech,
        })
    return result
def create_json_file(word, definition):
    """Write *definition* to ``<ROOT_DIR>/<first letter>/<word>.json``.

    Creates the file if it does not exist; otherwise merges any new
    definition strings into the existing file's "definitions" list,
    skipping duplicates.
    """
    first_letter = word[0]
    subdir_path = os.path.join(ROOT_DIR, first_letter)
    # BUG FIX: the per-letter folder may not exist yet; without this,
    # open(..., 'w') below raises FileNotFoundError for the first word
    # of each letter.
    os.makedirs(subdir_path, exist_ok=True)
    fname_path = os.path.join(subdir_path, word + ".json")
    if not os.path.exists(fname_path):
        with open(fname_path, 'w') as f:
            print("Creating definition ", word + ".json")
            json.dump(definition, f, indent=4)
        return
    with open(fname_path, 'r') as f:
        existing_def = json.load(f)
    # append only definitions not already present in the file
    file_defs = existing_def.get("definitions")
    for wdef in definition.get("definitions"):
        if wdef not in file_defs:
            file_defs.append(wdef)
    existing_def["definitions"] = file_defs
    with open(fname_path, 'w') as file:
        print("Updating definitions for ", word + ".json")
        json.dump(existing_def, file, indent=4)
def generate(input_words):
    """Download, parse, and persist definitions for every input word.

    Each collected entry is keyed by ``Word_partofspeech`` and written
    out as a JSON file once all words have been processed.
    """
    collected = {}
    print("Starting... ")
    for word in input_words:
        page = download_page(build_api_url(word))
        if page is not None:
            for entry in parse_html(word, page):
                key = "{}_{}".format(word.title(), entry.get('parts-of-speech').lower())
                collected[key] = entry
        print("processed ", word)
        time.sleep(3)  # just to be not suspicious :)
    for key, entry in collected.items():
        create_json_file(key, entry)
if __name__ == "__main__":
words = ['your', 'list', 'of', 'word']
generate(words)