Commit

Update
lanbing510 committed Jan 12, 2016
1 parent 326b85b commit 4370492
Showing 2 changed files with 32 additions and 14 deletions.
Binary file added book_list-数学.xlsx
46 changes: 32 additions & 14 deletions doubanSpider.py
@@ -13,19 +13,21 @@
sys.setdefaultencoding('utf8')



#Some User Agents
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
{'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]
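# Note: hds now lives at module scope, apparently so that book_spider and the
# new get_people_num can share it and pick a random User-Agent per request
# (see hds[np.random.randint(0,len(hds))] in get_people_num below).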


def book_spider(book_tag):
    page_num=0
    count=1
    book_list=[]
    try_times=0

    while(1):
        #url='http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0' # For Test
        url='http://www.douban.com/tag/'+urllib.quote(book_tag)+'/book?start='+str(page_num*15)
        time.sleep(np.random.rand()*2)

        #Last Version
@@ -54,32 +56,47 @@ def book_spider(book_tag):
            title = book_info.find('a', {'class':'title'}).string.strip()
            desc = book_info.find('div', {'class':'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', {'class':'title'}).get('href')

            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except:
                author_info = '作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except:
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', {'class':'rating_nums'}).string.strip()
            except:
                rating = '0.0'
            try:
                #people_num = book_info.findAll('span')[2].string.strip()
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except:
                people_num = '0'

            # Each row: [title, rating, people_num, author_info, pub_info]
            book_list.append([title,rating,people_num,author_info,pub_info])
            try_times=0 # reset to 0 once valid information was obtained
        page_num+=1
        print 'Downloading Information From Page %d' % page_num
    return book_list


def get_people_num(url):
    # Fetch the book's subject page and read the ratings count from its rating_sum block
    #url='http://book.douban.com/subject/6082808/?from=tag_all' # For Test
    try:
        req = urllib2.Request(url, headers=hds[np.random.randint(0,len(hds))])
        source_code = urllib2.urlopen(req).read()
        plain_text=str(source_code)
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
    soup = BeautifulSoup(plain_text)
    people_num = soup.find('div',{'class':'rating_sum'}).findAll('span')[1].string.strip()
    return people_num
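# A minimal usage sketch for get_people_num, reusing the test URL from the
# comment above; it assumes that page is still reachable and still carries the
# rating_sum markup:
#
#   num = get_people_num('http://book.douban.com/subject/6082808/?from=tag_all')
#   print num.strip('人评价')  # book_spider strips the '人评价' suffix the same way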


def do_spider(book_tag_lists):
    book_lists=[]
    for book_tag in book_tag_lists:
@@ -112,7 +129,8 @@ def print_book_lists_excel(book_lists,book_tag_lists):
#book_tag_lists = ['心理','判断与决策','算法','数据结构','经济','历史']
#book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
#book_tag_lists=['思想','科技','科学','web','股票','爱情','两性']
#book_tag_lists=['计算机','机器学习','linux','android','数据库','互联网']
book_tag_lists=['数学']
book_lists=do_spider(book_tag_lists)
print_book_lists_excel(book_lists,book_tag_lists)
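# With book_tag_lists=['数学'], running the script under Python 2 crawls the
# 数学 tag and writes the results to an Excel file via print_book_lists_excel
# (its body is collapsed in this diff); the book_list-数学.xlsx added in this
# commit is presumably the output of such a run.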
