diff --git "a/book_list-\346\225\260\345\255\246.xlsx" "b/book_list-\346\225\260\345\255\246.xlsx" new file mode 100644 index 0000000..3505dc3 Binary files /dev/null and "b/book_list-\346\225\260\345\255\246.xlsx" differ diff --git a/doubanSpider.py b/doubanSpider.py index 5d7e88a..5b578ab 100644 --- a/doubanSpider.py +++ b/doubanSpider.py @@ -13,19 +13,21 @@ sys.setdefaultencoding('utf8') + +#Some User Agents +hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\ +{'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\ +{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}] + + def book_spider(book_tag): page_num=0; - count=1 book_list=[] try_times=0 - #Some User Agents - hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\ - {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\ - {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}] - while(1): - url="http://www.douban.com/tag/"+urllib.quote(book_tag)+"/book?start="+str(page_num*15) + #url='http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0' # For Test: sample of the URL built below (tag percent-encoded by urllib.quote, start offset grows by 15 per page) + url='http://www.douban.com/tag/'+urllib.quote(book_tag)+'/book?start='+str(page_num*15) time.sleep(np.random.rand()*2) #Last Version @@ -54,32 +56,47 @@ def book_spider(book_tag): title = book_info.find('a', {'class':'title'}).string.strip() desc = book_info.find('div', {'class':'desc'}).string.strip() desc_list = desc.split('/') + book_url = book_info.find('a', {'class':'title'}).get('href') try: author_info = '作者/译者: ' + '/'.join(desc_list[0:-3]) except: - author_info='作者/译者: 暂无' + author_info ='作者/译者: 暂无' try: pub_info = '出版信息: ' + '/'.join(desc_list[-3:]) except: - pub_info='出版信息: 暂无' + pub_info = '出版信息: 暂无' try: rating = book_info.find('span', 
{'class':'rating_nums'}).string.strip() except: rating='0.0' try: - people_num = book_info.findAll('span')[2].string.strip() - people_num=people_num.strip('人评价') + #people_num = book_info.findAll('span')[2].string.strip() + people_num = get_people_num(book_url) + people_num = people_num.strip('人评价') except: - people_num='0' + people_num ='0' book_list.append([title,rating,people_num,author_info,pub_info]) try_times=0 #set 0 when got valid information page_num+=1 - print "Downloading Information From Page %d" % page_num + print 'Downloading Information From Page %d' % page_num return book_list +def get_people_num(url): + """Fetch the book page at url (e.g. http://book.douban.com/subject/6082808/?from=tag_all) with a randomly chosen User-Agent from hds and return the stripped text of the second span inside the div of class rating_sum; return '0' if the request fails.""" + try: + req = urllib2.Request(url, headers=hds[np.random.randint(0,len(hds))]) + plain_text = str(urllib2.urlopen(req).read()) + except (urllib2.HTTPError, urllib2.URLError), e: + print e + return '0' # request failed: nothing to parse, so report the same '0' the caller falls back to + soup = BeautifulSoup(plain_text) + people_num=soup.find('div',{'class':'rating_sum'}).findAll('span')[1].string.strip() + return people_num + + def do_spider(book_tag_lists): book_lists=[] for book_tag in book_tag_lists: @@ -112,7 +129,8 @@ def print_book_lists_excel(book_lists,book_tag_lists): #book_tag_lists = ['心理','判断与决策','算法','数据结构','经济','历史'] #book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教'] #book_tag_lists=['思想','科技','科学','web','股票','爱情','两性'] - book_tag_lists=['计算机','机器学习','linux','android','数据库','互联网'] + #book_tag_lists=['计算机','机器学习','linux','android','数据库','互联网'] + book_tag_lists=['数学'] book_lists=do_spider(book_tag_lists) print_book_lists_excel(book_lists,book_tag_lists)