getw163.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
from pymongo import MongoClient
from bs4 import BeautifulSoup
import re
import time
class retrieve(object):
    """Crawl article links from a 163.com channel page and store their content."""

    def __init__(self, url):
        self.url = url

    def getlinks(self):
        # Collect article URLs of the form <channel-url><digits>...html.
        links = set()
        html = urllib.request.urlopen(self.url, timeout=10)
        bsObj = BeautifulSoup(html, "lxml", from_encoding='gb18030')
        for url in bsObj.findAll("a", href=re.compile("^(" + re.escape(self.url) + r")\d+.+(html)$")):
            if url.attrs['href'] is not None:
                links.add(url.attrs['href'])  # a set already ignores duplicates
        return links
    def executedb(self, link='', gettitle='', getcontent='', h=0, ins=False):
        # Credentials and database name are placeholders. db.authenticate() was
        # removed in PyMongo 4; pass the credentials to MongoClient instead.
        client = MongoClient('localhost', 27017,
                             username="user", password="passwd", authSource="w163")
        w163 = client.w163.w163
        if ins:
            data = {"hash": h, "date": time.ctime(), "url": link,
                    "title": gettitle, "content": getcontent}
            w163.insert_one(data)  # insert() is deprecated in modern PyMongo
    def getcontent(self):
        # Fetch each collected article page, extract title and body, and store them.
        links = self.getlinks()
        for link in links:
            h = hash(link)
            html = urllib.request.urlopen(link, timeout=10)
            bsObj = BeautifulSoup(html, "lxml", from_encoding='gb18030')
            if bsObj.find(id="epContentLeft") is not None:
                gettitle = bsObj.find(id="epContentLeft").find('h1').getText()
                getcontent = bsObj.find(id="endText").getText()
                self.executedb(link, gettitle, getcontent, h, ins=True)

class getw163link(object):
    """Collect 163.com channel home pages (e.g. http://news.163.com/) from a portal page."""

    def __init__(self, url):
        self.url = url

    def getalllink(self):
        links = []
        html = urllib.request.urlopen(self.url, timeout=10)
        bsObj = BeautifulSoup(html, "lxml")  # an explicit parser avoids a bs4 warning
        # Dots in the domain must be escaped, or the pattern also matches e.g. "x163y.com/".
        geta = bsObj.findAll('a', href=re.compile(r"^(http://)[a-zA-Z]+(\.163\.com/)$"))
        for link in geta:
            if link.attrs['href'] is not None and link.attrs['href'] not in links:
                links.append(link.attrs['href'])  # e.g. http://news.163.com
        return links
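
# A minimal usage sketch, not part of the original script: it assumes the
# portal page http://www.163.com/ and a reachable local MongoDB holding the
# placeholder credentials above. The entry-point guard below is hypothetical.
if __name__ == "__main__":
    for channel in getw163link("http://www.163.com/").getalllink():
        retrieve(channel).getcontent()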