Skip to content

Commit

Permalink
Request one page and iterate over pages; friend-list page done.
Browse files Browse the repository at this point in the history
  • Loading branch information
JackonYang committed Sep 16, 2013
1 parent 508a6e2 commit 2d6cd45
Showing 1 changed file with 54 additions and 9 deletions.
63 changes: 54 additions & 9 deletions client/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import urllib
from httplib2 import Http
import os
import re

max_timeout = 5
resend_n = 3
headers_templates = {
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24',
Expand All @@ -28,7 +31,7 @@ def __init__(self, user, password, autoLogin=True):
# print 'no need login'

def signin(self, user, password):
"""sigin to renren.com. return cookie if success."""
"""sigin to renren.com. return and save cookie if success."""
# TODO:
# 1. deal with timeout
# 2. random useragent
Expand Down Expand Up @@ -60,6 +63,11 @@ def signin(self, user, password):
f.write(content)
return None

def friendList(self, rid, max_pages=100):
    """Crawl the friend list of user *rid* on renren.com.

    rid       -- renren numeric user id whose friends are listed
    max_pages -- upper bound on list pages to request (crawling stops
                 earlier when a page yields no items)

    Returns the set of matched friend-entry HTML snippets collected by
    __iter_page, or None when the crawl aborts (too many timeouts /
    retry budget resend_n exhausted).
    """
    url_ptn = "http://friend.renren.com/GetFriendList.do?curpage={}&id={}"
    # Escape the literal dots in the hostname/path: an unescaped '.'
    # matches any character, so the old pattern also matched e.g.
    # "wwwXrenren.com".  \d+ and the lazy .*? are unchanged.
    item_ptn = re.compile(
        r'<dd>\s*<a\s+href="http://www\.renren\.com/profile\.do\?id=\d+">.*?</a>')
    return self.__iter_page(url_ptn, item_ptn, rid, max_pages, resend_n)

def renrenId(self):
import re
proj = re.compile(r'\Wid=(\d+);')
Expand All @@ -70,7 +78,50 @@ def renrenId(self):
return None

def request(self, url, method='GET'):
    """Request *url* with the saved login cookie and return the HTML body.

    url    -- absolute URL to fetch
    method -- HTTP method, 'GET' by default

    The stale pre-refactor line `return self.h.request(...)` (diff
    residue) is removed; the response headers are discarded and only the
    body is returned.
    """
    # Start from the shared template so per-request mutation never
    # pollutes the module-level headers_templates dict.
    headers = headers_templates.copy()
    headers['Cookie'] = self.cookie
    rsp, content = self.h.request(url, method, headers=headers)
    # Debug dump of the most recent response body.
    # NOTE(review): overwritten on every call -- presumably a temporary
    # debugging aid; confirm before shipping.
    with open('fl.html', 'w') as f:
        f.write(content)
    return content

def __iter_page(self, url_ptn, item_ptn, rid, pages, resend):
    """Crawl pages of url_ptn for user *rid* and collect regex matches.

    url_ptn  -- URL template with two {} slots: (page number, rid)
    item_ptn -- compiled regex; findall() results are accumulated
    pages    -- int (crawl pages 0..pages-1) or an explicit sequence of
                page numbers (used when re-sending timed-out pages)
    resend   -- remaining retry rounds for timed-out pages

    Returns the set of all matched items, or None when the retry budget
    is exhausted or more than max_timeout requests fail in one round.
    """
    if resend < 0:
        return None  # retry budget exhausted
    if isinstance(pages, int):
        pages = range(pages)

    items_all = set()
    timeout_seq = []  # page numbers whose request raised

    # Request pages until one yields no items (privacy block, end of
    # list, or a safety page), collecting matches along the way.
    for curpage in pages:
        try:
            html_content = self.request(url_ptn.format(curpage, rid))
        # Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; Exception keeps the intended "treat any
        # request failure as a timeout" behavior without that hazard.
        except Exception:
            timeout_seq.append(curpage)
            # Abort when failures pile up well beyond normal.
            if len(timeout_seq) > max_timeout:
                return None  # more timeouts than max_timeout
        else:
            items_curpage = item_ptn.findall(html_content)
            if items_curpage:
                items_all.update(items_curpage)
            else:  # privacy, all pages requested, or safety page
                break

    # Re-request the pages that timed out, spending one retry round.
    if timeout_seq:
        item_re = self.__iter_page(url_ptn, item_ptn, rid,
                                   timeout_seq, resend - 1)
        if item_re is None:
            return None
        items_all.update(item_re)
    # NOTE(review): a safety-page check on an empty items_all was
    # sketched here and left commented out in the original.
    return items_all

def __get_cookie(self):
cookie = None
Expand All @@ -87,10 +138,4 @@ def __save_cookie(self, cookie):
if __name__ == '__main__':
from settings import account
rr = renren(account['email'], account['password'])
print rr.renrenId()
#url = "http://friend.renren.com/GetFriendList.do?curpage={}&id={}".format(0, rr.renrenId)
#print url
#rsp, content = rr.request(url)
#print rsp
#with open('friendlist.html', 'w') as f:
# f.write(content)
print len(rr.friendList(rr.renrenId()))

0 comments on commit 2d6cd45

Please sign in to comment.