Skip to content

Commit

Permalink
add qingdaonews
Browse files Browse the repository at this point in the history
  • Loading branch information
Nyloner committed Dec 20, 2016
1 parent 4297e22 commit 584dbf7
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions club.qingdaonews.com/article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import requests
from bs4 import BeautifulSoup
import openpyxl

headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
'Cookie':'PHPSESSID=d2a521b9298f8691e4c37487b6657ac3; Hm_lvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199772; Hm_lpvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199852; CNZZDATA1000084976=1383072779-1482195841-null%7C1482195841; username=JarMrmn4olyPFzOAltjC0Q%3D%3D; password=jv2Y7Ga10EoO2Tn3W%2FY1plZvYz1QGqB2; NSC_dmvc=ffffffff09020e0445525d5f4f58455e445a4a423660',
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_article(endpage):
page=1
result=[]
while True:
url='http://club.qingdaonews.com/usercenter/mytopic.php?page=%s'%page
try:
html=requests.get(url,headers=headers,timeout=30).text
except:
continue
table=BeautifulSoup(html,'lxml').find('div',{'class':'add_list'}).find_all('li')
for li in table:
try:
url='http://club.qingdaonews.com'+li.find('a').get('href')
title=li.find('a').get_text()
result.append([title,url])
except:
continue
if page==endpage:
break
print(page,'ok')
page+=1
return result

def main():
result=get_article(168)
excel=openpyxl.Workbook(write_only=True)
sheet=excel.create_sheet()
for line in result:
sheet.append(line)
excel.save('urls.xlsx')

main()

0 comments on commit 584dbf7

Please sign in to comment.