Skip to content

Commit

Permalink
More Scrapy stuff
Browse files: browse the repository at this point in the history
  • Loading branch information
REMitchell committed Jan 22, 2018
1 parent 1f4e5c1 commit 7db6393
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 15 deletions.
15 changes: 15 additions & 0 deletions v2/Chapter05_Scrapy/wikiSpider/wikiSpider/article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import scrapy

class ArticleSpider(scrapy.Spider):
    """Spider that requests three fixed Wikipedia URLs and prints each page's title."""

    name = 'article'

    def start_requests(self):
        """Return a list with one ``scrapy.Request`` per seed URL.

        Every request uses ``self.parse`` as its callback.
        """
        seed_urls = (
            "http://en.wikipedia.org/wiki/Python_%28programming_language%29",
            "https://en.wikipedia.org/wiki/Functional_programming",
            "https://en.wikipedia.org/wiki/Monty_Python",
        )
        pending = []
        for seed in seed_urls:
            pending.append(scrapy.Request(url=seed, callback=self.parse))
        return pending

    def parse(self, response):
        """Print the text of the first ``h1`` node found in ``response``."""
        heading = response.css('h1::text').extract_first()
        print('Title is: {}'.format(heading))
19 changes: 19 additions & 0 deletions v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articleItems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia from a single start page and return an ``Article`` per page.

    The single rule extracts links whose URL contains ``/wiki/`` with no ``:``
    anywhere after it, sends each response to ``parse_items`` and keeps
    following links from those pages.
    """

    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True),
    ]

    def parse_items(self, response):
        """Build an ``Article`` item from one crawled page.

        Args:
            response: the response handed over by the crawl rule.

        Returns:
            An ``Article`` with ``title`` (first ``h1`` text, or ``None`` if
            there is none), ``text`` (list of every text node under
            ``div#mw-content-text``) and ``lastUpdated`` (footer text with the
            'This page was last edited on ' prefix removed; empty string when
            the footer element is missing).
        """
        article = Article()
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        # extract_first() yields None when nothing matches, which would make
        # the .replace() below raise AttributeError; fall back to ''.
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first(default='')
        article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '')
        return article
26 changes: 14 additions & 12 deletions v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articles.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

# NOTE(review): this span looks like a rendered diff whose +/- markers were
# stripped, so lines from the old and the new version of articles.py appear
# interleaved (two class headers, two method headers in a row). TODO confirm
# against the repository before treating it as one runnable definition.

# Presumably the removed version: a plain scrapy.Spider named 'articles'.
class ArticleSpider(scrapy.Spider):
name='articles'
# Presumably the added version: a CrawlSpider limited to wikipedia.org that
# starts from one page; its single rule matches every link ('.*'), sends the
# response to parse_items and keeps following links.
class ArticleSpider(CrawlSpider):
name = 'articles'
allowed_domains = ['wikipedia.org']
start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse_items', follow=True)]

# Builds one Request per hard-coded URL, each with self.parse as callback.
# Presumably part of the removed version (the CrawlSpider uses start_urls).
def start_requests(self):
urls = [
"http://en.wikipedia.org/wiki/Python_%28programming_language%29",
"https://en.wikipedia.org/wiki/Functional_programming",
"https://en.wikipedia.org/wiki/Monty_Python"]
return [scrapy.Request(url=a, callback=self.parse) for a in urls]

# Two method headers follow back to back; presumably `parse` is the removed
# name and `parse_items` (the callback named in `rules`) is the added one.
def parse(self, response):
def parse_items(self, response):
# First h1 text of the page.
title = response.css('h1::text').extract_first()
print('Title is: {}'.format(title))
# Every text node under div#mw-content-text, as a list of strings.
text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
# NOTE(review): .replace() is called directly on the extract_first() result;
# confirm it cannot be None for the pages this spider reaches.
lastUpdated = lastUpdated.replace('This page was last edited on ', '')
print('title is: {} '.format(title))
print('text is: {}'.format(text))
print('Last updated: {}'.format(lastUpdated))
25 changes: 25 additions & 0 deletions v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia and report article pages and non-article pages differently.

    Two rules share the ``parse_items`` callback and tell it, through
    ``cb_kwargs``, which kind of link produced the response:

    * links whose URL contains ``/wiki/`` with no ``:`` after it are treated
      as articles and followed further;
    * any other link is treated as a non-article (``follow`` is not set for
      this rule).
    """

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # The pattern is searched in the absolute URL, so it must not be
        # anchored with '^' to the path: '^/wiki/' can never match a URL that
        # starts with 'https://', which would leave this rule dead.
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True, cb_kwargs={'is_article': True}),
        Rule(LinkExtractor(allow='.*'), callback='parse_items', cb_kwargs={'is_article': False}),
    ]

    def parse_items(self, response, is_article):
        """Print what was found on one crawled page.

        Args:
            response: the response handed over by the matching crawl rule.
            is_article: ``True`` when the article rule matched the link,
                ``False`` when the catch-all rule did.
        """
        print(response.url)
        title = response.css('h1::text').extract_first()
        if is_article:
            text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
            # extract_first() yields None when the footer element is missing,
            # which would make .replace() raise AttributeError; fall back to ''.
            lastUpdated = response.css('li#footer-info-lastmod::text').extract_first(default='')
            lastUpdated = lastUpdated.replace('This page was last edited on ', '')
            print('Title is: {} '.format(title))
            print('text is: {}'.format(text))
            print('Last updated: {}'.format(lastUpdated))
        else:
            print('This is not an article: {}'.format(title))

7 changes: 4 additions & 3 deletions v2/Chapter05_Scrapy/wikiSpider/wikiSpider/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import scrapy


# NOTE(review): this class header has only comments beneath it and is directly
# followed by another class header; presumably it is the removed side of a
# rendered diff whose +/- markers were stripped. TODO confirm.
class WikispiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
class Article(scrapy.Item):
    """Scrapy item declaring three fields: ``title``, ``text`` and ``lastUpdated``."""

    title = scrapy.Field()
    text = scrapy.Field()
    lastUpdated = scrapy.Field()

0 comments on commit 7db6393

Please sign in to comment.