Skip to content

Commit

Permalink
Slybot add line feed character support to FeedGenerator
Browse files Browse the repository at this point in the history
  • Loading branch information
ruairif committed Jun 2, 2017
1 parent 6a18772 commit 3ba03cc
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 4 deletions.
2 changes: 1 addition & 1 deletion slybot/slybot/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Scrapy project settings for slybot.
EXTENSIONS = {'slybot.closespider.SlybotCloseSpider': 1}
ITEM_PIPELINES = {
    'slybot.dupefilter.DupeFilterPipeline': 1,
    # Fixed spelling: was 'DropMetaPipleine'.
    'slybot.meta.DropMetaPipeline': 2,
}
SPIDER_MIDDLEWARES = {'slybot.spiderlets.SpiderletsMiddleware': 999}  # as close as possible to spider output
DOWNLOADER_MIDDLEWARES = {
Expand Down
6 changes: 4 additions & 2 deletions slybot/slybot/starturls/feed_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import re
from scrapy import Request
# Matches a single CR or LF character, so feed bodies can be split
# regardless of line-ending style (\r, \n, or \r\n).
_NEWLINE_RE = re.compile('[\r\n]')


class FeedGenerator(object):
Expand All @@ -9,7 +11,7 @@ def __call__(self, url):
return Request(url, callback=self.parse_urls)

def parse_urls(self, response):
    """Yield a :class:`Request` for each non-empty URL line in *response*.

    The body is split on every CR or LF character via ``_NEWLINE_RE``, so
    feeds using ``\r``, ``\n``, or ``\r\n`` line endings are all handled.
    Empty fragments (e.g. between the characters of a ``\r\n`` pair) are
    dropped. Each resulting request uses ``self.callback``.
    """
    newline_urls = _NEWLINE_RE.split(response.text)
    # Filter out the empty strings produced by consecutive delimiters.
    urls = [url for url in newline_urls if url]
    for url in urls:
        yield Request(url, callback=self.callback)
21 changes: 20 additions & 1 deletion slybot/slybot/tests/test_starturls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from unittest import TestCase

from slybot.starturls import FragmentGenerator, IdentityGenerator, StartUrlCollection, UrlGenerator
from scrapy import Request
from scrapy.http.response.text import TextResponse
from slybot.starturls import (
FragmentGenerator, IdentityGenerator, StartUrlCollection, UrlGenerator,
FeedGenerator)


class StartUrlCollectionTest(TestCase):
Expand Down Expand Up @@ -478,5 +482,20 @@ def test_normalized_mixed(self):

self.assertEqual(list(collection.normalize()), normalized)

def test_feed_url(self):
    """FeedGenerator yields one Request per URL for CR, LF and CRLF feeds."""
    url = 'http://example.com/feed'
    feed = FeedGenerator(lambda: 0)
    # TextResponse raises TypeError when given a str body without an
    # explicit encoding, so one must be supplied here.
    response = TextResponse(url, encoding='utf-8', body=(
        'http://example.com/1\r'
        'http://example.com/2\r\n'
        'http://example.com/3\n\r'
        'http://example.com/4\n'))
    self.assertEqual([r.url for r in feed.parse_urls(response)], [
        'http://example.com/1',
        'http://example.com/2',
        'http://example.com/3',
        'http://example.com/4',
    ])

def generator_set(generator, start_urls):
    """Materialize the URLs *generator* produces for *start_urls* as a set.

    ``generator`` is a zero-argument factory returning a callable that maps
    ``start_urls`` to an iterable of URLs.
    """
    # set() consumes any iterable directly; the intermediate list() call
    # in the original was redundant.
    return set(generator()(start_urls))

0 comments on commit 3ba03cc

Please sign in to comment.