Commit 73109bf

Fixed SgmlLinkExtractor constructor to properly handle both string and list parameters (attrs, tags, deny_extensions)
1 parent f687455 commit 73109bf

3 files changed: +74 −7 lines changed


docs/topics/link-extractors.rst (+4 −3)

@@ -69,8 +69,9 @@ SgmlLinkExtractor
         domains which won't be considered for extracting the links
     :type deny_domains: str or list
 
-    :param deny_extensions: a list of extensions that should be ignored when
-        extracting links. If not given, it will default to the
+    :param deny_extensions: a single value or list of strings containing
+        extensions that should be ignored when extracting links.
+        If not given, it will default to the
         ``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
         module.
     :type deny_extensions: list
@@ -85,7 +86,7 @@ SgmlLinkExtractor
         Defaults to ``('a', 'area')``.
     :type tags: str or list
 
-    :param attrs: list of attributes which should be considered when looking
+    :param attrs: an attribute or list of attributes which should be considered when looking
         for links to extract (only for those tags specified in the ``tags``
         parameter). Defaults to ``('href',)``
     :type attrs: list
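
The documented behavior can be exercised directly. As a minimal usage sketch (the "jpg" and list values below are illustrative, not taken from the commit), the string and list spellings configure the extractor identically after this change:

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # A bare string now behaves like a one-element list for these parameters,
    # so the two extractors below are configured the same way.
    lx_from_lists = SgmlLinkExtractor(tags=["a"], attrs=["href"], deny_extensions=["jpg"])
    lx_from_strings = SgmlLinkExtractor(tags="a", attrs="href", deny_extensions="jpg")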

scrapy/contrib/linkextractors/sgml.py (+4 −4)

@@ -95,7 +95,7 @@ def matches(self, url):
 class SgmlLinkExtractor(BaseSgmlLinkExtractor):
 
     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                  deny_extensions=None):
         self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
         self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
@@ -105,9 +105,9 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
         self.canonicalize = canonicalize
         if deny_extensions is None:
             deny_extensions = IGNORED_EXTENSIONS
-        self.deny_extensions = set(['.' + e for e in deny_extensions])
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
+        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
+        tag_func = lambda x: x in arg_to_iter(tags)
+        attr_func = lambda x: x in arg_to_iter(attrs)
         BaseSgmlLinkExtractor.__init__(self,
             tag=tag_func,
             attr=attr_func,
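
The change relies on Scrapy's arg_to_iter helper to normalize each argument before building the tag/attr membership tests and the extension set. As a rough sketch of what that normalization does (an illustrative reimplementation, not the actual scrapy.utils.misc code):

    def arg_to_iter_sketch(arg):
        # Illustrative stand-in for scrapy.utils.misc.arg_to_iter:
        # None becomes an empty list, a bare string becomes a one-element
        # list, and any other iterable is passed through unchanged.
        if arg is None:
            return []
        if isinstance(arg, basestring):
            return [arg]
        if hasattr(arg, '__iter__'):
            return arg
        return [arg]

    # Why this matters here: with the old default attrs=('href') -- really
    # just the string 'href' -- the test `x in attrs` was a substring check,
    # so e.g. 'ref' would match. With arg_to_iter, `x in ['href']` is an
    # exact membership test, and single-string values such as tags="a" or
    # deny_extensions="jpg" are accepted alongside lists.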

scrapy/tests/test_contrib_linkextractors.py (+66)

@@ -277,6 +277,11 @@ def test_deny_extensions(self):
             Link(url='http://example.org/page.html', text=u'asd'),
         ])
 
+        lx = SgmlLinkExtractor(deny_extensions="jpg")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.org/page.html', text=u'asd'),
+        ])
+
     def test_process_value(self):
         """Test restrict_xpaths with encodings"""
         html = """
@@ -304,6 +309,67 @@ def test_base_url_with_restrict_xpaths(self):
             [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
 
+    def test_attrs(self):
+        lx = SgmlLinkExtractor(attrs="href")
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+            Link(url='http://www.google.com/something', text=u''),
+            Link(url='http://example.com/innertag.html', text=u'inner tag'),
+        ])
+
+        lx = SgmlLinkExtractor(attrs=None)
+        self.assertEqual(lx.extract_links(self.response), [])
+
+        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+        lx = SgmlLinkExtractor(attrs=("href"))
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+
+    def test_tags(self):
+        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
+        response = HtmlResponse("http://example.com/index.html", body=html)
+
+        lx = SgmlLinkExtractor(tags=None)
+        self.assertEqual(lx.extract_links(response), [])
+
+        lx = SgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags="area")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample1.html', text=u''),
+        ])
+
+        lx = SgmlLinkExtractor(tags="a")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+        ])
+
+        lx = SgmlLinkExtractor(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample2.jpg', text=u''),
+        ])
+
+
 class HtmlParserLinkExtractorTestCase(unittest.TestCase):
 
     def setUp(self):
