Skip to content

Commit

Permalink
-v_1.1
Browse files Browse the repository at this point in the history
chenjiandongx committed Mar 27, 2017

Verified

This commit was signed with the committer’s verified signature.
npezza93 Nick Pezza
1 parent 98547ea commit 669744a
Showing 6 changed files with 44 additions and 16 deletions.
Binary file added images/word_cloud.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion stackoverflow/middleware/useragent.py
Original file line number Diff line number Diff line change
@@ -50,4 +50,4 @@ def process_request(self, request, spider):
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
]
8 changes: 4 additions & 4 deletions stackoverflow/spiders/stackoverflow_spider.py
Original file line number Diff line number Diff line change
@@ -9,21 +9,21 @@ class StackoverflowSpider(scrapy.Spider):
def start_requests(self):
    """Yield one request per top-voted question listing page.

    Covers pages 11-20 (inclusive) at 50 questions per page; each
    response is handled by :meth:`parse`.
    """
    urls = ['http://stackoverflow.com/questions?page={page}&sort=votes&pagesize=50'.format(page=page)
            for page in range(11, 21)]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)


def parse(self, response):
    """Extract one StackoverflowItem per question summary on the page.

    The listing shows up to 50 questions; each is addressed by its
    1-based position in the ``#questions`` container.
    """
    for index in range(1, 51):
        sel = response.xpath('//*[@id="questions"]/div[{index}]'.format(index=index))
        item = StackoverflowItem()
        item['votes'] = sel.xpath('div[1]/div[2]/div[1]/div[1]/span/strong/text()').extract()
        item['answers'] = sel.xpath('div[1]/div[2]/div[2]/strong/text()').extract()
        # title attribute presumably reads like "12,345 views"; keep the
        # bare number only — TODO confirm against a live page.
        item['views'] = "".join(sel.xpath('div[1]/div[3]/@title').extract()).split()[0].replace(",", "")
        item['questions'] = sel.xpath('div[2]/h3/a/text()').extract()
        # href presumably looks like /questions/<id>/<slug>; segment 2 is
        # the question id — TODO confirm.
        item['links'] = "".join(sel.xpath('div[2]/h3/a/@href').extract()).split("/")[2]
        item['tags'] = sel.xpath('div[2]/div[2]/a/text()').extract()
        yield item
File renamed without changes.
21 changes: 10 additions & 11 deletions stackoverflow/mysql/sql.py → stackoverflow/utility/mysql.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import json
import pymysql

# MySQL connection for the scraped-data import script.
# NOTE(review): credentials are hard-coded — consider moving them to a
# config file or environment variables.
conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
cur = conn.cursor()

# Load the scraped question records exported by the spider.
with open(r"e:\python\stackoverflow\data\data2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
@@ -15,17 +15,16 @@ def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
value = (s_links, s_views, s_votes, s_answers, s_tags, s_questions)
cur.execute(sql, value)
conn.commit()
print(s_links + " Done")
print("Insert s_links: " + s_links)

# Insert every scraped record; a failure on one row is logged and the
# import continues with the next row (best-effort, matches prior behavior).
for value in data:
    s_links = value['links']
    s_views = value['views']
    s_votes = value['votes']
    s_answers = value['answers']
    s_tags = " ".join(value['tags'])
    s_questions = value['questions']
    try:
        insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    except Exception as e:
        print(e)
29 changes: 29 additions & 0 deletions stackoverflow/utility/word_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from collections import Counter
import json
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Collect every tag from both scraped JSON data files.  The two files
# share the same record schema, so one loop handles both.
tags = []
for path in (r"e:\python\stackoverflow\data\data1.json",
             r"e:\python\stackoverflow\data\data2.json"):
    with open(path, "r", encoding="utf-8") as f:
        for record in json.load(f):
            tags.extend(record['tags'])

# Keep only the 200 most frequent tags for the cloud.
counter_most = Counter(tags).most_common(200)

# Render the cloud; the font path points at a local CJK-capable font
# (Microsoft YaHei) — presumably required for non-ASCII tags.
wordcloud = WordCloud(font_path=r"e:\Font\msyh.ttf",
                      width=1200,
                      height=600).generate_from_frequencies(dict(counter_most))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

wordcloud.to_file(r'e:\python\stackoverflow\images\word_cloud.jpg')

0 comments on commit 669744a

Please sign in to comment.