forked from REMitchell/python-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1f4e5c1
commit 7db6393
Showing
5 changed files
with
77 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import scrapy | ||
|
||
class ArticleSpider(scrapy.Spider):
    """Spider that requests a fixed list of Wikipedia pages and prints each title."""

    name = 'article'

    def start_requests(self):
        """Return one ``scrapy.Request`` per seed page, each handled by ``parse``."""
        seed_pages = (
            "http://en.wikipedia.org/wiki/Python_%28programming_language%29",
            "https://en.wikipedia.org/wiki/Functional_programming",
            "https://en.wikipedia.org/wiki/Monty_Python",
        )
        requests = []
        for page in seed_pages:
            requests.append(scrapy.Request(url=page, callback=self.parse))
        return requests

    def parse(self, response):
        """Print the text of the first ``h1`` element selected from ``response``."""
        heading = response.css('h1::text').extract_first()
        print('Title is: {}'.format(heading))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from scrapy.contrib.linkextractors import LinkExtractor | ||
from scrapy.contrib.spiders import CrawlSpider, Rule | ||
from wikiSpider.items import Article | ||
|
||
class ArticleSpider(CrawlSpider):
    """Crawl from a Wikipedia start page and build an ``Article`` item per page.

    Links are followed when their URL contains ``/wiki/`` and has no ``:``
    between that segment and the end of the URL.
    """

    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # Raw string so the regex is never subject to string-escape processing.
        Rule(
            LinkExtractor(allow=r'(/wiki/)((?!:).)*$'),
            callback='parse_items',
            follow=True,
        ),
    ]

    def parse_items(self, response):
        """Populate and return an ``Article`` from a crawled page.

        Sets ``title`` (first ``h1`` text), ``text`` (all text nodes under
        ``div#mw-content-text``) and ``lastUpdated`` (footer text with the
        'This page was last edited on ' prefix removed).  ``lastUpdated`` is
        ``None`` when the footer element is not present on the page.
        """
        article = Article()
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath(
            '//div[@id="mw-content-text"]//text()'
        ).extract()

        last_updated = response.css(
            'li#footer-info-lastmod::text'
        ).extract_first()
        # extract_first() yields None when nothing matches; calling
        # .replace() on it would raise AttributeError and drop the item.
        if last_updated is not None:
            last_updated = last_updated.replace(
                'This page was last edited on ', ''
            )
        article['lastUpdated'] = last_updated
        return article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,17 @@ | ||
import scrapy | ||
from scrapy.contrib.linkextractors import LinkExtractor | ||
from scrapy.contrib.spiders import CrawlSpider, Rule | ||
|
||
class ArticleSpider(CrawlSpider):
    """Crawl from a Wikipedia start page, following every extracted link.

    Each crawled page is passed to ``parse_items``, which prints the page's
    title, body text and last-edited footer text.
    """

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(LinkExtractor(allow=r'.*'), callback='parse_items', follow=True),
    ]

    def parse_items(self, response):
        """Print the title, text nodes and last-edited string of a page.

        The last-edited value is printed as ``None`` when the page has no
        ``li#footer-info-lastmod`` element.
        """
        title = response.css('h1::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        last_updated = response.css(
            'li#footer-info-lastmod::text'
        ).extract_first()
        # extract_first() yields None when nothing matches; guard so a page
        # without the footer element does not raise AttributeError.
        if last_updated is not None:
            last_updated = last_updated.replace(
                'This page was last edited on ', ''
            )
        print('title is: {} '.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(last_updated))
25 changes: 25 additions & 0 deletions
25
v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from scrapy.contrib.linkextractors import LinkExtractor | ||
from scrapy.contrib.spiders import CrawlSpider, Rule | ||
|
||
class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia with two rules that tag pages as article or non-article.

    The first rule follows links whose URL contains ``/wiki/`` with no ``:``
    after it and calls ``parse_items`` with ``is_article=True``.  The second
    rule matches any link and calls ``parse_items`` with ``is_article=False``.
    """

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # LinkExtractor applies ``allow`` to absolute URLs, so a leading
        # '^' before '/wiki/' can never match 'https://...'; the pattern is
        # left unanchored at the start.
        Rule(
            LinkExtractor(allow=r'(/wiki/)((?!:).)*$'),
            callback='parse_items',
            follow=True,
            cb_kwargs={'is_article': True},
        ),
        Rule(
            LinkExtractor(allow=r'.*'),
            callback='parse_items',
            cb_kwargs={'is_article': False},
        ),
    ]

    def parse_items(self, response, is_article):
        """Print details of a crawled page.

        Args:
            response: the response for the crawled page.
            is_article: value supplied through the matching rule's
                ``cb_kwargs``; when true the title, text and last-edited
                string are printed, otherwise only a non-article notice.
        """
        print(response.url)
        title = response.css('h1::text').extract_first()
        if not is_article:
            print('This is not an article: {}'.format(title))
            return

        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        last_updated = response.css(
            'li#footer-info-lastmod::text'
        ).extract_first()
        # extract_first() yields None when nothing matches; guard so a page
        # without the footer element does not raise AttributeError.
        if last_updated is not None:
            last_updated = last_updated.replace(
                'This page was last edited on ', ''
            )
        print('Title is: {} '.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(last_updated))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters