multiprocessing_test3.py
#! /usr/bin/env python3
# -*- coding:utf-8 -*-
###############################################################
# © kenwaldek MIT-license
#
# Title: multiprocessing 3 Version: 1.0
# Date: 30-12-16 Language: python3
# Description: multiprocessing, i.e. using multiple cores together:
# a spider looks for links in a web page and then for links in those pages
###############################################################
# Run this script from a terminal to execute it.
from multiprocessing import Pool
import bs4 as bs
import random
import requests
import string


def random_starting_url():
    # Build a random three-letter .com domain to use as a crawl seed.
    starting = ''.join(random.SystemRandom().choice(string.ascii_lowercase) for _ in range(3))
    url = ''.join(['http://', starting, '.com'])
    return url
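
# For illustration: random_starting_url() might return e.g. 'http://qkz.com'
# (a hypothetical value). Most of these made-up domains will not resolve,
# which is why get_links() below catches and reports every request failure
# instead of crashing.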


def handle_local_links(url, link):
    # Turn a relative link (one that starts with '/') into an absolute URL
    # by prefixing the page's base URL; absolute links pass through unchanged.
    if link.startswith('/'):
        return ''.join([url, link])
    else:
        return link
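
# A quick sketch of the resolution above, with hypothetical values:
#   handle_local_links('http://abc.com', '/about')          -> 'http://abc.com/about'
#   handle_local_links('http://abc.com', 'http://xyz.com')  -> 'http://xyz.com'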


def get_links(url):
    # Fetch a page and collect the href of every <a> tag in its body.
    try:
        resp = requests.get(url)
        soup = bs.BeautifulSoup(resp.text, 'lxml')
        body = soup.body
        links = [link.get('href') for link in body.find_all('a')]
        links = [handle_local_links(url, link) for link in links]
        # Normalize every link to plain ASCII text; a non-ASCII link raises
        # UnicodeEncodeError, which is caught by the generic except below.
        links = [link.encode('ascii').decode('ascii') for link in links]
        return links
    except TypeError as e:
        print(e)
        print('Got a TypeError, probably got a None that we tried to iterate over')
        return []
    except IndexError as e:
        print(e)
        print('We probably did not find any useful links, returning an empty list')
        return []
    except AttributeError as e:
        print(e)
        print('Likely got None for a link, so we are throwing this page away')
        return []
    except Exception as e:
        print(str(e))
        # log this error
        return []
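
# Note: get_links() runs in the worker processes via Pool.map() below, so it
# must be defined at module top level (the workers locate it by name); its
# print() output from different workers may interleave on the terminal.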


def main():
    how_many = 50
    # One worker process per seed URL, so all pages are fetched in parallel.
    p = Pool(processes=how_many)
    parse_us = [random_starting_url() for _ in range(how_many)]
    data = p.map(get_links, parse_us)
    # Flatten the per-page link lists into one flat list of URLs.
    data = [url for url_list in data for url in url_list]
    p.close()
    p.join()
    with open('urls.txt', 'w') as f:
        f.write(str(data))


if __name__ == '__main__':
    main()
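
# A minimal alternative sketch of main(), kept commented out so the script's
# behavior is unchanged: since Python 3.3, multiprocessing.Pool works as a
# context manager that terminates the pool on exit. The name main_alt and the
# one-URL-per-line output format are assumptions, not part of the original.
#
# def main_alt():
#     how_many = 50
#     parse_us = [random_starting_url() for _ in range(how_many)]
#     with Pool(processes=how_many) as p:
#         data = p.map(get_links, parse_us)
#     flat = [url for url_list in data for url in url_list]
#     with open('urls.txt', 'w') as f:
#         f.write('\n'.join(flat))  # one URL per line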