forked from Te-k/harpoon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbing.py
75 lines (71 loc) · 2.34 KB
/
bing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from dateutil.parser import parse
from harpoon.lib.utils import same_url
class Bing(object):
    """Query Bing web search and retrieve Bing's cached copies of pages."""

    @staticmethod
    def search(query):
        """
        Search in bing.

        Args:
            query (str): the search query

        Returns:
            list of dict, one per result, with keys 'name' (link title),
            'url' (target url), 'text' (snippet, '' if absent) and,
            when Bing exposes a cached copy, 'cache' (cache.aspx url)
        """
        # FIXME : change default UA
        r = requests.get(
            "https://www.bing.com/search",
            params={'q': query}
        )
        soup = BeautifulSoup(r.text, 'lxml')
        res = []
        divs = soup.find_all('li', class_='b_algo')
        for d in divs:
            data = {
                'name': d.a.text,
                'url': d.a['href'],
                # Some results have no <p> snippet; avoid AttributeError on None
                'text': d.p.text if d.p is not None else ''
            }
            attribution = d.find('div', class_='b_attribution')
            # Check if cache infos in attribute; the div itself may be absent
            if attribution is not None and 'u' in attribution.attrs:
                b = attribution['u'].split('|')
                # The 'u' attribute packs |-separated fields; d and w are the
                # 3rd and 4th — guard against short/malformed values
                if len(b) >= 4:
                    data['cache'] = "http://cc.bingj.com/cache.aspx?d=%s&w=%s" % (
                        b[2],
                        b[3]
                    )
            res.append(data)
        return res

    @staticmethod
    def download_cache(url):
        """
        Download cache data from a cached page.

        Args:
            url (str): a cc.bingj.com cache.aspx url (as returned by search())

        Returns:
            dict with 'success' (bool) and, on success, 'date' (datetime of
            the cached copy), 'data' (cached html, '' if the wrapper div is
            missing), 'url' (original page url) and 'cacheurl' (url argument)
        """
        r = requests.get(url)
        if r.status_code == 200:
            if "Could not find the requested document in the cache" in r.text:
                # Bing bug: 200 response but no actual cached document
                return {"success": False}
            soup = BeautifulSoup(r.text, 'lxml')
            content = soup.find('div', class_='cacheContent')
            return {
                "success": True,
                # Second <strong> of the cache header holds the snapshot date
                "date": parse(soup.find_all('strong')[1].text),
                # Strip the wrapping '<div class="cacheContent">' / '</div>'
                # markup from the serialized tag; guard against a missing div
                "data": str(content)[26:-40] if content is not None else "",
                'url': soup.strong.a['href'],
                'cacheurl': url
            }
        if r.status_code != 404:
            print('Weird, it should return 200 or 404')
        return {"success": False}

    @staticmethod
    def cache(url):
        """
        Search for an url in Bing cache.

        Args:
            url (str): page url to look up

        Returns:
            dict: download_cache() result for the first search hit whose url
            matches (per same_url), else {'success': False}
        """
        res = Bing.search(url)
        for i in res:
            if same_url(url, i['url']):
                if 'cache' in i:
                    return Bing.download_cache(i['cache'])
        return {'success': False}