-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb-crawl.py
44 lines (35 loc) · 997 Bytes
/
web-crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from json import *
from urllib2 import *
# Fetch the CNN article that the rest of the script processes.
get = urlopen('http://www.cnn.com/2014/11/21/justice/newtown-shooter-adam-lanza-report/index.html?hpt=hp_t1')
wordlist = []
dictionary = {}
try:
    # Read the whole response body into a single string.
    text = get.read()
finally:
    # Release the HTTP connection even if the read fails.
    get.close()
def read_cnn_doc(text):
    """Return the article portion of a CNN page.

    The article is taken to start at the first ``<strong>(CNN)</strong>``
    marker and to stop just before the last ``endclickprint`` marker that
    follows it.

    Args:
        text: The page markup as a string.

    Returns:
        The slice of ``text`` between the two markers (start marker
        included, end marker excluded). If the start marker is absent the
        slice begins at the start of ``text``; if the end marker is absent
        the slice runs to the end of ``text``.
    """
    start_marker = "<strong>(CNN)</strong>"
    end_marker = "endclickprint"

    div = text.find(start_marker)
    if div == -1:
        # find() signals "not found" with -1; using that as a slice index
        # would silently select from the last character instead.
        div = 0

    # Only an end marker located after the start marker is meaningful.
    end_div = text.rfind(end_marker, div)
    if end_div == -1:
        end_div = len(text)

    return text[div:end_div]
def remove_script(text):
    """Strip ``<script>`` elements and leftover markup characters from text.

    Every ``<script ...>...</script>`` element is removed in full. After
    that, a fixed set of markup fragments (paragraph tags, HTML comment
    delimiters, angle brackets, stray ``/script`` and ``/``) is replaced
    with a single space each.

    Args:
        text: Markup as a string.

    Returns:
        The cleaned string. An opening ``<script`` with no matching
        ``</script>`` causes everything from that tag onward to be dropped.
    """
    open_tag = "<script"
    close_tag = "</script>"

    # Remove script portions.
    index = text.find(open_tag)
    while index != -1:
        # Look for the closing tag after the opening one, so an earlier
        # stray "</script>" cannot be picked up by mistake.
        end_index = text.find(close_tag, index)
        if end_index == -1:
            # Unterminated script: nothing after the tag is article text,
            # and continuing would never make progress.
            text = text[:index]
            break
        text = text[:index] + text[end_index + len(close_tag):]
        index = text.find(open_tag)

    # Purify! Multi-character fragments must be handled before the bare
    # "<" and ">" replacements, otherwise they can no longer match.
    fragments = (
        "<p class=",
        "</p>",
        "<!--",
        "-->",
        "<",
        ">",
        "/script",
        "/",
    )
    for fragment in fragments:
        text = text.replace(fragment, " ")
    return text
# Pipeline: isolate the article body, scrub scripts and markup, then show it.
mod = remove_script(read_cnn_doc(text))
print(mod)