# functions.py
import json
import re

import requests
from bs4 import BeautifulSoup

from classes import Article


def get_article(target_url: str):
"""
Crawlt den übergebenen String (Webseite), extrahiert die gewünschten Daten & Metadaten (Datum, Uhrzeit, Titel, Untertitel, Inhalt, Autor:in(nen), URL) des Artikels und gibt sie als `Article`-Objekt zurück
Args:
target_url (str): Webadresse des zu verarbeitenden Artikels
Returns: Objekt der Klasse `Article`(`date`, `time`, `title`, `subtitle`, `body`, `author`, `url`)
"""
    # Resolve the canonical URL via the og:url meta tag, then fetch the article page.
    raw = requests.get(target_url)
    raw_soup = BeautifulSoup(raw.content, "html.parser")
    og_url = str(raw_soup.find('meta', attrs={'property': 'og:url'})).split('"')[1]
    r = requests.get(og_url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Author from the author meta tag; agency reports carry no such tag.
    try:
        author = str(soup.find('meta', attrs={'name': 'author'})).split('"')[1]
    except (AttributeError, IndexError):
        author = 'N/A (Agenturmeldung)'

    # Title and subtitle from the Open Graph meta tags; decode literal &amp; entities.
    title = str(soup.find('meta', attrs={'property': 'og:title'})).split('"')[1]
    title = re.sub(r'&amp;', '&', title)
    try:
        subtitle = str(soup.find('meta', attrs={'property': 'og:description'})).split('"')[1]
        subtitle = re.sub(r'&amp;', '&', subtitle)
    except (AttributeError, IndexError):
        subtitle = 'NONE'
    # Category tags come from the "breadcrumbs" entry of the embedded Fusion script.
    article_tags = []
    script = soup.find_all('script', type='application/javascript')[3].text.strip()
    script_split_tags = script.split(',"breadcrumbs":[')[1]
    script_split_tags_dict = script_split_tags.split('],"canonicalUrl":"')[0]
    cat_data = json.loads(script_split_tags_dict)
    cat_data_canon = cat_data['href'].split('/')
    for i in range(1, len(cat_data_canon)):
        article_tags.append(cat_data_canon[i].lower())
    article_tags.append(cat_data['title'].lower())
    article_tags = [*set(article_tags)]  # remove duplicate tags
    # The article body sits in the "elements" array of the same script; cut it out
    # and parse it as JSON.
    script_split = script.split(',"elements":')
    script_body = script_split[1].split('],"headline":')[0] + ']'
    data = json.loads(script_body)
    body = []
    for element in data:
        if element['type'] == 'text' or element['type'] == 'header':
            # Skip in-house promotional paragraphs (app download, cross-references, ...).
            if ('Laden Sie sich jetzt hier kostenfrei unsere neue LVZ-App herunter' in element['text']
                    or 'Lesen Sie auch' in element['text']
                    or 'LVZ+ gratis' in element['text']
                    or 'Für iOS' in element['text']
                    or 'Für Android' in element['text']):
                pass
            # Paragraphs containing inline markup or HTML entities are cleaned up first.
            elif ('<a href=' in element['text'] or '<strong>' in element['text']
                    or '&nbsp;' in element['text'] or '<b>' in element['text']
                    or '</b>' in element['text'] or '<br/>' in element['text']
                    or '<em>' in element['text'] or '<b style=' in element['text']
                    or '&amp;' in element['text'] or '<i>' in element['text']
                    or '</i>' in element['text']):
                text = re.sub(r'(<a href="\S+)">', '', element['text'])
                text = re.sub(r'<a href="\S+"', '', text)
                text = re.sub(r'target="_blank">', '', text)
                text = re.sub(r'target="_self">', '', text)
                text = re.sub(r'</a>$', '', text)
                text = re.sub(r'</a>', ' ', text)
                text = re.sub(r'<strong>', '', text)
                text = re.sub(r'</strong>$\S', ' ', text)
                text = re.sub(r'</strong>', '', text)
                text = re.sub(r'&nbsp;\d\d', '', text)
                text = re.sub(r'&nbsp;', '', text)
                text = re.sub(r'^(<b style="[\s\S]+)">', '', text)
                text = re.sub(r'(<b style="[\s\S]+)">', ' ', text)
                text = re.sub(r'^<b>', '', text)
                text = re.sub(r'<b>', ' ', text)
                text = re.sub(r'</b>$\S', ' ', text)
                text = re.sub(r'<b>', '', text)
                text = re.sub(r'</b>', '', text)
                text = re.sub(r'<br/>', '', text)
                text = re.sub(r'<em>', '', text)
                text = re.sub(r'</i>$', '', text)
                text = re.sub(r'<i>', '', text)
                text = re.sub(r'&amp;', '&', text)
                text = re.sub(r'„', '"', text)
                text = re.sub(r'“', '"', text)
                text = re.sub(r'\s{2,10}', ' ', text)  # collapse any multiple spaces introduced above
                body.append(text)
            else:
                body.append(element['text'])

    # Publication date/time from the Fusion global content metadata; displayDate is an
    # ISO-8601 timestamp, reformatted here to DD.MM.YYYY and "HH:MM Uhr".
    script_meta = script_split[0].split(';Fusion.globalContent=')[1] + '}'
    meta_data = json.loads(script_meta)
    raw_date = meta_data['displayDate']
    article_date = f'{raw_date[8:10]}.{raw_date[5:7]}.{raw_date[0:4]}'
    article_time = f'{raw_date[11:16]} Uhr'

    return Article(article_date, article_time, title, subtitle, body, author, og_url, article_tags)
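

# Minimal usage sketch, not part of the original module. It assumes `classes.Article`
# exposes the attributes named in the docstring above; the URL below is a made-up
# placeholder, not a real lvz.de article.
if __name__ == '__main__':
    example_url = 'https://www.lvz.de/lokales/leipzig/beispiel-artikel.html'  # hypothetical URL
    article = get_article(example_url)
    print(article.date, article.time)
    print(article.title)
    print(article.subtitle)
    print('\n'.join(article.body))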