_abstract.py
# mypy: disallow_untyped_defs=False

import inspect
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from recipe_scrapers.settings import settings

from ._grouping_utils import IngredientGroup
from ._schemaorg import SchemaOrg

# Some sites close their content for 'bots', so user-agent must be supplied
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"
}


class AbstractScraper:
    page_data: Union[str, bytes]

    def __init__(
        self,
        url: Union[str, None],
        proxies: Optional[
            Dict[str, str]
        ] = None,  # allows us to specify optional proxy server
        timeout: Optional[
            Union[float, Tuple[float, float], Tuple[float, None]]
        ] = None,  # allows us to specify optional timeout for request
        wild_mode: Optional[bool] = False,
        html: Union[str, bytes, None] = None,
    ):
        if html:
            self.page_data = html
            self.url = url
        else:
            assert url is not None, "url required for fetching recipe data"
            resp = requests.get(
                url,
                headers=HEADERS,
                proxies=proxies,
                timeout=timeout,
            )
            self.page_data = resp.content
            self.url = resp.url

        self.wild_mode = wild_mode
        self.soup = BeautifulSoup(self.page_data, "html.parser")
        self.schema = SchemaOrg(self.page_data)

        # attach the plugins as instructed in settings.PLUGINS;
        # the class-level "plugins_initialized" flag ensures each scraper class
        # is decorated only once, and reversed() makes the first plugin listed
        # in settings.PLUGINS the outermost wrapper around each method
        if not hasattr(self.__class__, "plugins_initialized"):
            for name, _ in inspect.getmembers(self, inspect.ismethod):
                current_method = getattr(self.__class__, name)
                for plugin in reversed(settings.PLUGINS):
                    if plugin.should_run(self.host(), name):
                        current_method = plugin.run(current_method)
                setattr(self.__class__, name, current_method)
            setattr(self.__class__, "plugins_initialized", True)
    @classmethod
    def host(cls) -> str:
        """Host domain of the recipe URL."""
        raise NotImplementedError("This should be implemented.")

    def canonical_url(self):
        """Canonical or original URL of the recipe."""
        canonical_link = self.soup.find("link", {"rel": "canonical", "href": True})
        if canonical_link:
            return urljoin(self.url, canonical_link["href"])
        return self.url

    def title(self):
        """Title of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def category(self):
        """Category of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def total_time(self):
        """Total time needed to prepare and cook the recipe in minutes."""
        raise NotImplementedError("This should be implemented.")

    def cook_time(self):
        """Cooking time in minutes."""
        raise NotImplementedError("This should be implemented.")

    def prep_time(self):
        """Preparation time in minutes."""
        raise NotImplementedError("This should be implemented.")

    def cooking_method(self):
        """The method of cooking the recipe"""
        raise NotImplementedError("This should be implemented.")

    def yields(self):
        """Total servings or items in the recipe."""
        raise NotImplementedError("This should be implemented.")

    def image(self):
        """An image URL for the recipe."""
        raise NotImplementedError("This should be implemented.")

    def nutrients(self):
        """Nutrients of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def language(self):
        """Language the recipe is written in."""
        candidate_languages = OrderedDict()
        html = self.soup.find("html", {"lang": True})
        candidate_languages[html.get("lang")] = True

        # Deprecated: check for a meta http-equiv header
        # See: https://www.w3.org/International/questions/qa-http-and-lang
        meta_language = self.soup.find(
            "meta",
            {
                "http-equiv": lambda x: x and x.lower() == "content-language",
                "content": True,
            },
        )
        if meta_language:
            language = meta_language.get("content").split(",", 1)[0]
            if language:
                candidate_languages[language] = True

        # If other languages exist, remove 'en' commonly generated by HTML editors
        if len(candidate_languages) > 1:
            candidate_languages.pop("en", None)

        # Return the first candidate language
        return candidate_languages.popitem(last=False)[0]
    def ingredients(self):
        """Ingredients of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def ingredient_groups(self) -> List[IngredientGroup]:
        """List of ingredient groups."""
        return [IngredientGroup(purpose=None, ingredients=self.ingredients())]

    def instructions(self) -> str:
        """Instructions to prepare the recipe."""
        raise NotImplementedError("This should be implemented.")

    def instructions_list(self) -> List[str]:
        """Instructions to prepare the recipe as a list."""
        return [
            instruction
            for instruction in self.instructions().split("\n")
            if instruction
        ]

    def ratings(self):
        """Ratings of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def author(self):
        """Author of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def cuisine(self):
        """Cuisine of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def description(self):
        """Description of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def reviews(self):
        """Reviews of the recipe."""
        raise NotImplementedError("This should be implemented.")

    def equipment(self):
        """Equipment needed for the recipe."""
        raise NotImplementedError("This should be implemented.")

    def links(self):
        """Links found in the recipe."""
        invalid_href = {"#", ""}
        links_html = self.soup.findAll("a", href=True)

        return [link.attrs for link in links_html if link["href"] not in invalid_href]

    def site_name(self):
        """Name of the website."""
        meta = self.soup.find("meta", property="og:site_name")
        return meta.get("content") if meta else None
    def to_json(self):
        """Recipe information in JSON format."""
        json_dict = {}

        public_method_names = [
            method
            for method in dir(self)
            if callable(getattr(self, method))
            if not method.startswith("_") and method not in ["soup", "links", "to_json"]
        ]
        for method in public_method_names:
            try:
                if method == "ingredient_groups":
                    json_dict[method] = [i.__dict__ for i in getattr(self, method)()]
                else:
                    json_dict[method] = getattr(self, method)()
            except Exception:
                # fields a concrete scraper does not implement are simply skipped
                pass
        return json_dict
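

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): a minimal hypothetical
# subclass showing how AbstractScraper is meant to be specialised. The class
# name ExampleRecipes and the domain "example-recipes.test" are made up, and
# the body assumes the SchemaOrg helper exposes title()/ingredients()/
# instructions()/yields(), as concrete scrapers in the upstream project
# commonly rely on.
# ---------------------------------------------------------------------------
class ExampleRecipes(AbstractScraper):
    @classmethod
    def host(cls) -> str:
        # a real scraper returns the domain it supports, e.g. "allrecipes.com"
        return "example-recipes.test"

    def title(self):
        return self.schema.title()

    def ingredients(self):
        return self.schema.ingredients()

    def instructions(self):
        return self.schema.instructions()

    def yields(self):
        return self.schema.yields()


# A scraper can be built from a URL (triggers an HTTP GET with HEADERS) or
# from pre-fetched HTML, in which case no network request is made:
#
#   scraper = ExampleRecipes(url="https://example-recipes.test/some-recipe")
#   scraper = ExampleRecipes(url=None, html=open("recipe.html", "rb").read())
#   print(scraper.to_json())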