Commit 73109bf

Fixed SgmlLinkExtractor constructor to properly handle both string and list parameters (attrs, tags, deny_extensions)
1 parent f687455 commit 73109bf

3 files changed: +74 −7 lines changed


docs/topics/link-extractors.rst (+4 −3)

@@ -69,8 +69,9 @@ SgmlLinkExtractor
         domains which won't be considered for extracting the links
     :type deny_domains: str or list
 
-    :param deny_extensions: a list of extensions that should be ignored when
-        extracting links. If not given, it will default to the
+    :param deny_extensions: a single value or list of strings containing
+        extensions that should be ignored when extracting links.
+        If not given, it will default to the
         ``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
         module.
     :type deny_extensions: list
@@ -85,7 +86,7 @@ SgmlLinkExtractor
         Defaults to ``('a', 'area')``.
     :type tags: str or list
 
-    :param attrs: list of attributes which should be considered when looking
+    :param attrs: an attribute or list of attributes which should be considered when looking
         for links to extract (only for those tags specified in the ``tags``
         parameter). Defaults to ``('href',)``
     :type attrs: list
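
The documented behavior can be exercised directly. As a minimal usage sketch (the "jpg" and list values below are illustrative, not taken from the commit), the string and list spellings configure the extractor identically after this change:

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # A bare string now behaves like a one-element list for these parameters,
    # so the two extractors below are configured the same way.
    lx_from_lists = SgmlLinkExtractor(tags=["a"], attrs=["href"], deny_extensions=["jpg"])
    lx_from_strings = SgmlLinkExtractor(tags="a", attrs="href", deny_extensions="jpg")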

scrapy/contrib/linkextractors/sgml.py (+4 −4)

@@ -95,7 +95,7 @@ def matches(self, url):
 class SgmlLinkExtractor(BaseSgmlLinkExtractor):
 
     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                  deny_extensions=None):
         self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
         self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
@@ -105,9 +105,9 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
         self.canonicalize = canonicalize
         if deny_extensions is None:
             deny_extensions = IGNORED_EXTENSIONS
-        self.deny_extensions = set(['.' + e for e in deny_extensions])
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
+        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
+        tag_func = lambda x: x in arg_to_iter(tags)
+        attr_func = lambda x: x in arg_to_iter(attrs)
         BaseSgmlLinkExtractor.__init__(self,
             tag=tag_func,
             attr=attr_func,
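
The change relies on Scrapy's arg_to_iter helper to normalize each argument before building the tag/attr membership tests and the extension set. As a rough sketch of what that normalization does (an illustrative reimplementation, not the actual scrapy.utils.misc code):

    def arg_to_iter_sketch(arg):
        # Illustrative stand-in for scrapy.utils.misc.arg_to_iter:
        # None becomes an empty list, a bare string becomes a one-element
        # list, and any other iterable is passed through unchanged.
        if arg is None:
            return []
        if isinstance(arg, basestring):
            return [arg]
        if hasattr(arg, '__iter__'):
            return arg
        return [arg]

    # Why this matters here: with the old default attrs=('href') -- really
    # just the string 'href' -- the test `x in attrs` was a substring check,
    # so e.g. 'ref' would match. With arg_to_iter, `x in ['href']` is an
    # exact membership test, and single-string values such as tags="a" or
    # deny_extensions="jpg" are accepted alongside lists.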

scrapy/tests/test_contrib_linkextractors.py (+66)

@@ -277,6 +277,11 @@ def test_deny_extensions(self):
             Link(url='http://example.org/page.html', text=u'asd'),
         ])
 
+        lx = SgmlLinkExtractor(deny_extensions="jpg")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.org/page.html', text=u'asd'),
+        ])
+
     def test_process_value(self):
         """Test restrict_xpaths with encodings"""
         html = """
@@ -304,6 +309,67 @@ def test_base_url_with_restrict_xpaths(self):
             [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
 
+    def test_attrs(self):
+        lx = SgmlLinkExtractor(attrs="href")
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=None)
+        self.assertEqual(lx.extract_links(self.response), [])
+
+        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+        lx = SgmlLinkExtractor(attrs=("href"))
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+
+    def test_tags(self):
+        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+
+        lx = SgmlLinkExtractor(tags=None)
+        self.assertEqual(lx.extract_links(response), [])
+
+        lx = SgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags="area")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+        lx = SgmlLinkExtractor(tags="a")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+        ])
+
+
 class HtmlParserLinkExtractorTestCase(unittest.TestCase):
 
     def setUp(self):
