replace htmllaundry with beautifulsoup

nstapelbroek · Mar 26, 2024 · 14d3ad1
1 parent e937e7e
commit 14d3ad1
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 4 deletions.
diff --git a/crawler.py b/crawler.py
@@ -22,7 +22,7 @@
     "USER_AGENT",
     getenv(
         "USER_AGENT",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
     ),
 )
 settings.set("CONCURRENT_REQUESTS", getenv("CONCURRENT_REQUESTS", "8"))

diff --git a/estate_crawler/util.py b/estate_crawler/util.py
@@ -1,7 +1,7 @@
 # coding=utf-8
 from scrapy.selector import Selector
 from scrapy.http.response.html import HtmlResponse
-from htmllaundry import strip_markup
+from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 
 
@@ -56,8 +56,8 @@ def string(html, css_selector="*") -> str:
         if isinstance(data, int):
             data = str(data)
 
-        data = strip_markup(data)
-        return data.strip()
+        soup = BeautifulSoup(data, 'html.parser')
+        return soup.get_text()
 
     @staticmethod
     def volume(html, css_selector="*") -> float: