Skip to content

Commit

Permalink
-v_1.1
Browse files Browse the repository at this point in the history
chenjiandongx committed Mar 27, 2017

Verified

This commit was signed with the committer’s verified signature.
npezza93 Nick Pezza
1 parent 98547ea commit 669744a
Showing 6 changed files with 44 additions and 16 deletions.
Binary file added images/word_cloud.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion stackoverflow/middleware/useragent.py
Original file line number Diff line number Diff line change
@@ -50,4 +50,4 @@ def process_request(self, request, spider):
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
]
8 changes: 4 additions & 4 deletions stackoverflow/spiders/stackoverflow_spider.py
Original file line number Diff line number Diff line change
@@ -9,21 +9,21 @@ class StackoverflowSpider(scrapy.Spider):
def start_requests(self):
    """Yield one request per top-voted question listing page.

    Covers pages 11-20 (inclusive) at 50 questions per page; each
    response is handled by :meth:`parse`.
    """
    urls = ['http://stackoverflow.com/questions?page={page}&sort=votes&pagesize=50'.format(page=page)
            for page in range(11, 21)]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)


def parse(self, response):
    """Extract one StackoverflowItem per question summary on the page.

    The listing shows up to 50 questions; each is addressed by its
    1-based position in the ``#questions`` container.
    """
    for index in range(1, 51):
        sel = response.xpath('//*[@id="questions"]/div[{index}]'.format(index=index))
        item = StackoverflowItem()
        item['votes'] = sel.xpath('div[1]/div[2]/div[1]/div[1]/span/strong/text()').extract()
        item['answers'] = sel.xpath('div[1]/div[2]/div[2]/strong/text()').extract()
        # title attribute presumably reads like "12,345 views"; keep the
        # bare number only — TODO confirm against a live page.
        item['views'] = "".join(sel.xpath('div[1]/div[3]/@title').extract()).split()[0].replace(",", "")
        item['questions'] = sel.xpath('div[2]/h3/a/text()').extract()
        # href presumably looks like /questions/<id>/<slug>; segment 2 is
        # the question id — TODO confirm.
        item['links'] = "".join(sel.xpath('div[2]/h3/a/@href').extract()).split("/")[2]
        item['tags'] = sel.xpath('div[2]/div[2]/a/text()').extract()
        yield item
File renamed without changes.
21 changes: 10 additions & 11 deletions stackoverflow/mysql/sql.py → stackoverflow/utility/mysql.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import json
import pymysql

# MySQL connection for the scraped-data import script.
# NOTE(review): credentials are hard-coded — consider moving them to a
# config file or environment variables.
conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
cur = conn.cursor()

# Load the scraped question records exported by the spider.
with open(r"e:\python\stackoverflow\data\data2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
@@ -15,17 +15,16 @@ def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
value = (s_links, s_views, s_votes, s_answers, s_tags, s_questions)
cur.execute(sql, value)
conn.commit()
print(s_links + " Done")
print("Insert s_links: " + s_links)

# Insert every scraped record; a failure on one row is logged and the
# import continues with the next row (best-effort, matches prior behavior).
for value in data:
    s_links = value['links']
    s_views = value['views']
    s_votes = value['votes']
    s_answers = value['answers']
    s_tags = " ".join(value['tags'])
    s_questions = value['questions']
    try:
        insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    except Exception as e:
        print(e)
29 changes: 29 additions & 0 deletions stackoverflow/utility/word_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from collections import Counter
import json
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Collect every tag from both scraped JSON data files.  The two files
# share the same record schema, so one loop handles both.
tags = []
for path in (r"e:\python\stackoverflow\data\data1.json",
             r"e:\python\stackoverflow\data\data2.json"):
    with open(path, "r", encoding="utf-8") as f:
        for record in json.load(f):
            tags.extend(record['tags'])

# Keep only the 200 most frequent tags for the cloud.
counter_most = Counter(tags).most_common(200)

# Render the cloud; the font path points at a local CJK-capable font
# (Microsoft YaHei) — presumably required for non-ASCII tags.
wordcloud = WordCloud(font_path=r"e:\Font\msyh.ttf",
                      width=1200,
                      height=600).generate_from_frequencies(dict(counter_most))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

wordcloud.to_file(r'e:\python\stackoverflow\images\word_cloud.jpg')

0 comments on commit 669744a

Please sign in to comment.