Skip to content

Commit

Permalink
replace htmllaundry with beautifulsoup
Browse files Browse the repository at this point in the history
  • Loading branch information
nstapelbroek committed Mar 26, 2024
1 parent e937e7e commit 14d3ad1
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"USER_AGENT",
getenv(
"USER_AGENT",
- "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
),
)
settings.set("CONCURRENT_REQUESTS", getenv("CONCURRENT_REQUESTS", "8"))
Expand Down
6 changes: 3 additions & 3 deletions estate_crawler/util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# coding=utf-8
from scrapy.selector import Selector
from scrapy.http.response.html import HtmlResponse
- from htmllaundry import strip_markup
+ from bs4 import BeautifulSoup
from urllib.parse import urlparse


Expand Down Expand Up @@ -56,8 +56,8 @@ def string(html, css_selector="*") -> str:
if isinstance(data, int):
data = str(data)

- data = strip_markup(data)
- return data.strip()
+ soup = BeautifulSoup(data, 'html.parser')
+ return soup.get_text()

@staticmethod
def volume(html, css_selector="*") -> float:
Expand Down

0 comments on commit 14d3ad1

Please sign in to comment.