@@ -277,6 +277,11 @@ def test_deny_extensions(self):
277
277
Link (url = 'http://example.org/page.html' , text = u'asd' ),
278
278
])
279
279
280
+ lx = SgmlLinkExtractor (deny_extensions = "jpg" )
281
+ self .assertEqual (lx .extract_links (response ), [
282
+ Link (url = 'http://example.org/page.html' , text = u'asd' ),
283
+ ])
284
+
280
285
def test_process_value (self ):
281
286
"""Test restrict_xpaths with encodings"""
282
287
html = """
@@ -304,6 +309,67 @@ def test_base_url_with_restrict_xpaths(self):
304
309
[Link (url = 'http://otherdomain.com/base/item/12.html' , text = 'Item 12' )])
305
310
306
311
312
def test_attrs(self):
    """Exercise the ``attrs`` option of ``SgmlLinkExtractor``.

    Four scenarios are checked in turn: a single attribute name given
    as a string, several attribute names combined with an explicit
    ``tags`` tuple, ``attrs=None``, and a small inline document in which
    one anchor carries ``ref`` instead of ``href``.
    """
    # Scenario 1: one attribute name passed as a plain string, run
    # against the shared ``self.response`` fixture (built outside this
    # method).
    href_extractor = SgmlLinkExtractor(attrs="href")
    found = href_extractor.extract_links(self.response)
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(found, expected)

    # Scenario 2: also scan ``src`` on ``img`` tags; the empty
    # ``deny_extensions`` tuple goes with the expected ``.jpg`` link.
    multi_attr_extractor = SgmlLinkExtractor(
        attrs=("href", "src"),
        tags=("a", "area", "img"),
        deny_extensions=(),
    )
    found = multi_attr_extractor.extract_links(self.response)
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(found, expected)

    # Scenario 3: with ``attrs=None`` no links are expected at all.
    no_attr_extractor = SgmlLinkExtractor(attrs=None)
    self.assertEqual(no_attr_extractor.extract_links(self.response), [])

    # Scenario 4: the <a> element uses ``ref`` rather than ``href``, so
    # only the <area> link is expected.
    markup = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    page = HtmlResponse("http://example.com/index.html", body=markup)
    # NOTE: ("href") is a parenthesised string, not a one-element tuple.
    paren_extractor = SgmlLinkExtractor(attrs=("href"))
    found = paren_extractor.extract_links(page)
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
    ]
    self.assertEqual(found, expected)
341
+
342
+
343
def test_tags(self):
    """Exercise the ``tags`` option of ``SgmlLinkExtractor``.

    A three-element document (``area``, ``a`` and ``img``) is parsed with
    ``tags=None``, with the extractor's defaults, with a single tag name
    given as a string, and with a tuple of tags plus extra attributes.
    """
    markup = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
    page = HtmlResponse("http://example.com/index.html", body=markup)

    def links_for(**options):
        # Build an extractor from ``options`` and run it on ``page``.
        return SgmlLinkExtractor(**options).extract_links(page)

    # ``tags=None``: no links are expected.
    self.assertEqual(links_for(tags=None), [])

    # Default arguments: the <area> and <a> links are expected, the
    # <img> is not.
    self.assertEqual(links_for(), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    # A single tag name passed as a string: only <area>.
    self.assertEqual(links_for(tags="area"), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])

    # A single tag name passed as a string: only <a>. The <area> link
    # must not appear even though "a" is a substring of "area".
    self.assertEqual(links_for(tags="a"), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    # <a> and <img> together, reading both ``href`` and ``src``; the
    # empty ``deny_extensions`` tuple goes with the expected ``.jpg``
    # link.
    self.assertEqual(
        links_for(tags=("a", "img"), attrs=("href", "src"), deny_extensions=()),
        [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ],
    )
371
+
372
+
307
373
class HtmlParserLinkExtractorTestCase (unittest .TestCase ):
308
374
309
375
def setUp (self ):
0 commit comments