-
Notifications
You must be signed in to change notification settings - Fork 88
/
Copy pathbs4_parser.py
41 lines (34 loc) · 1.55 KB
/
bs4_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
class HTMLParser(object):
    """
    Thin wrapper around BeautifulSoup4 for building an HTML tree and
    pulling plain text out of it.
    """

    def html_to_text(self, html_tree: BeautifulSoup) -> str:
        """
        Extract plain text from a parsed HTML tree.

        The ``script`` and ``style`` elements are detached from ``html_tree``
        itself before the text is collected, so the tree passed in is
        modified by this call.

        Args:
            html_tree (BeautifulSoup): Parsed HTML tree to read text from.

        Returns:
            str: Text of the remaining nodes, with each piece stripped and
                the pieces joined by a single space.
        """
        # Detach script/style elements so their contents are not emitted as text.
        for unwanted in html_tree.find_all(['script', 'style']):
            unwanted.extract()
        return html_tree.get_text(separator=' ', strip=True)

    def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
        """
        Parse raw HTML into a BeautifulSoup tree.

        Args:
            page (bytes): Raw HTML content as bytes.
            encoding (str, optional): Character encoding to decode ``page`` with.
                When it is None (or empty), the first candidate offered by
                bs4's ``EncodingDetector`` is used instead.
            features: Parser to be used (default='lxml'). See
                https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
                for the supported parsers.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                BeautifulSoup constructor. See
                https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup
                for the accepted arguments.

        Returns:
            BeautifulSoup: HTML tree object.
        """
        if not encoding:
            candidates = EncodingDetector(page, is_html=True).encodings
            # Only the first candidate is consulted; if the detector offers
            # none, the caller's (falsy) value is passed through untouched.
            encoding = next(iter(candidates), encoding)
        return BeautifulSoup(page, features, from_encoding=encoding, **kwargs)