main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to create an heatmap between Wikipedia cuisines pages
"""
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import unquote
from tqdm import tqdm
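# Local project modules: defs holds the filtering thresholds, utils the I/O and chunking helpers, visualization the plotting step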
import defs
from utils import strip_url, save_to_file, load_from_file, execute_steps, split_to_chunks
from visualization import step5_create_plots


def step1_prepare_cuisines_data():
    """Create a data structure starting from the cuisines list in Template:Cuisines"""
    req = requests.get('https://en.wikipedia.org/wiki/Template:Cuisines')
    soup = BeautifulSoup(req.text, features='html.parser')
    html_cuisines = soup.find(title='National dish').find_next('ul')
    cuisines_titles = []
    skipped = []
    for ch in html_cuisines:
        if not isinstance(ch, str):
            if len(ch.find_all('a')) > 1:
                # If it has sub-cuisines (regional ones), consider only the first
                cuisine = ch.find_all('a')[0]
            else:
                # If it's only a national cuisine
                cuisine = ch.find('a')
            # If it's not a redirect to a different page (e.g. a "cuisine" section in the country page)
            if not cuisine.get('class'):
                title, href = cuisine.get('title'), cuisine.get('href')
                cuisines_titles.append((title, unquote(href.replace('/wiki/', ''))))
            elif 'mw-redirect' in cuisine.get('class'):
                skipped.append(cuisine.get('title'))
            else:
                raise ValueError(f"Undefined case: {cuisine}")
    if skipped:
        for skip in skipped:
            print(f"[Skip] {skip} (redirect)")
    api_url = 'https://en.wikipedia.org/w/api.php'
    params = {'action': 'query', 'format': 'json'}
    cuisines_raw = {}
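    # The MediaWiki query API accepts at most 50 titles per request for regular clients, hence the chunking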
    for chunk in split_to_chunks(cuisines_titles, 50):
        params['titles'] = '|'.join(c[1] for c in chunk)
        with requests.Session() as session:
            post = session.post(api_url, params)
            res = post.json()
        for vv in res['query']['pages'].values():
            cuisines_raw[vv['title']] = {'pageid': str(vv['pageid']), 'languages': {}}
    save_to_file('data/cuisines_raw.dat', cuisines_raw)


def step2_populate_other_languages():
    """Get the URLs and titles of every cuisine page in all available languages"""
    cuisines_raw = load_from_file('data/cuisines_raw.dat')
    wiki_url = 'https://en.wikipedia.org/w/api.php'
    params = {'action': 'query', 'prop': 'langlinks|info', 'llprop': 'url', 'lllimit': 'max', 'format': 'json'}
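    # 'langlinks' returns the interlanguage links of a page; 'llprop': 'url' also includes each link's full URL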
print("Getting links for every cuisine for every language...")
for vv in tqdm(cuisines_raw.values()):
pageid = vv['pageid']
params['pageids'] = pageid
with requests.Session() as session:
post = session.post(wiki_url, params)
res = post.json()
res_info = res['query']['pages'][pageid]
if 'langlinks' in res_info:
vv['languages'] = {
vv['lang']: {
'title': vv['*'],
'wiki_url': strip_url(vv['url'])
}
for vv in res_info['langlinks']
}
vv['languages']['en'] = {}
vv['languages']['en']['length'] = res_info['length']
vv['languages']['en']['title'] = res['query']['pages'][pageid]['title']
save_to_file('data/cuisines_langs.dat', cuisines_raw)


def step3_fill_lengths():
    """Retrieve the lengths of the pages via the MediaWiki APIs"""
    cuisines = load_from_file('data/cuisines_langs.dat')
    # TODO: refactor by grouping pages together, so only one request is made per xyz.wikipedia.org
    params = {'action': 'query', 'prop': 'info', 'format': 'json'}
    skipped = []
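    # Every language edition exposes its own api.php, so each (cuisine, language) pair is queried on that wiki's host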
    for kk, vv in tqdm(cuisines.items()):
        for lang_prefix, page in tqdm(vv['languages'].items()):
            if lang_prefix != 'en':
                wiki_url = page['wiki_url']
                api_url = f'https://{wiki_url}/w/api.php'
                params['titles'] = page['title']
                with requests.Session() as session:
                    post = session.post(api_url, params)
                if not post.ok:
                    print("Issue in POST call")
                    print(f"{api_url}\n{params}")
                    continue
                res = post.json()
                page_data = res['query']['pages'][next(iter(res['query']['pages']))]
                if 'length' in page_data:
                    vv['languages'][lang_prefix]['length'] = page_data['length']
                else:
                    skipped.append((kk, lang_prefix))
    if skipped:
        for page, lang in skipped:
            print(f"[Skip] {page} in language {lang} (unavailable length)")
    save_to_file('data/cuisines_length.dat', cuisines)


def step4_preprocess_data_frame(create_full_df=False):
    """Create the pandas DataFrames, filtering out undesired data"""
    cuisines = load_from_file('data/cuisines_length.dat')
    # Set thresholds and output file (depending on whether the regular or the full dataframe is being created)
    if create_full_df:
        threshold_min_voice_length = 0
        threshold_min_cuisines = 0
        threshold_min_languages = 0
        filename = 'data/table_dataframe_full.dat'
    else:
        threshold_min_voice_length = defs.THRESHOLD_MIN_VOICE_LENGTH
        threshold_min_cuisines = defs.THRESHOLD_MIN_CUISINES
        threshold_min_languages = defs.THRESHOLD_MIN_LANGUAGES
        filename = 'data/table_dataframe.dat'
    # Set pandas view options
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
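    # The resulting table has one row per cuisine and one column per Wikipedia language,
    # holding the length of that cuisine's page in bytes (NaN when the page is missing or filtered out)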
    # Find languages to consider
    languages = set()
    for kk, vv in cuisines.items():
        for lang in vv['languages'].keys():
            languages.add(lang)
    languages = sorted(languages)
    languages.insert(0, 'Cuisine')
    # Create full table
    df_fulltable = pd.DataFrame(columns=languages)
    for kk, vv in tqdm(cuisines.items()):
        entry = {}
        for kk2, vv2 in vv['languages'].items():
            if 'length' in vv2 and kk2 in languages:
                entry[kk2] = vv2['length']
        # Add cuisine name (removing the trailing " cuisine")
        entry['Cuisine'] = kk.replace(" cuisine", "")
        df_fulltable = pd.concat([df_fulltable, pd.DataFrame([entry])], ignore_index=True)
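    # Blank out pages shorter than the minimum length so they count as missing in the filters below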
    short_voices = []
    for c_name, c_data in df_fulltable.items():
        if c_name != 'Cuisine':
            for idx, value in c_data.items():
                if not pd.isna(value) and int(value) < threshold_min_voice_length:
                    short_voices.append((c_name, idx, value))
    for c_name, idx, _ in short_voices:
        df_fulltable.at[idx, c_name] = np.nan
    # TODO: fix: results depend on the order in which the two filters below are applied
    # Keep only languages that have at least THRESHOLD_MIN_CUISINES cuisine pages written
    df_fulltable.dropna(axis=1, thresh=threshold_min_cuisines, inplace=True)
    # Keep only cuisines that appear in at least THRESHOLD_MIN_LANGUAGES languages
    df_fulltable = df_fulltable[df_fulltable.isnull().sum(axis=1) < len(df_fulltable.columns) - threshold_min_languages]
    df_fulltable.reset_index(drop=True, inplace=True)
    df_fulltable.set_index('Cuisine', inplace=True)
    df_fulltable.columns.names = ['Wikipedia language']
    save_to_file(filename, df_fulltable)


# yapf: disable
STEPS = [step1_prepare_cuisines_data,
         step2_populate_other_languages,
         step3_fill_lengths,
         step4_preprocess_data_frame]
# yapf: enable


def get_wikimedia_languages_list():
    """Download and build a mapping from language prefixes to their English and local language names"""
    wiki_languages = {}
    req = requests.get('https://meta.wikimedia.org/wiki/Table_of_Wikimedia_projects')
    soup = BeautifulSoup(req.text, features='html.parser')
    table = soup.find_all('table', class_='sortable')[0]
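    # The first sortable table on the page is expected to list the language prefix, English name and local name in its first three columns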
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue
        code, english_name, local_name = [td.text.strip() for td in tds[:3]]
        code = code.replace(':', '')
        wiki_languages[code] = {'eng_name': english_name, 'local_name': local_name}
    save_to_file('data/wiki_languages.dat', wiki_languages)


def main():
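    # Resume from the first missing intermediate file: every step saves its output under data/, so steps already completed are skipped on later runs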
    if not Path('data/cuisines_raw.dat').exists():
        execute_steps(STEPS, list(range(0, len(STEPS))))
    elif not Path('data/cuisines_langs.dat').exists():
        execute_steps(STEPS, list(range(1, len(STEPS))))
    elif not Path('data/cuisines_length.dat').exists():
        execute_steps(STEPS, list(range(2, len(STEPS))))
    elif not Path('data/table_dataframe.dat').exists():
        execute_steps(STEPS, list(range(3, len(STEPS))))
    if not Path('data/table_dataframe_full.dat').exists():
        step4_preprocess_data_frame(create_full_df=True)
    if not Path('data/wiki_languages.dat').exists():
        get_wikimedia_languages_list()
    cc1 = load_from_file('data/cuisines_raw.dat')
    cc2 = load_from_file('data/cuisines_langs.dat')
    cc3 = load_from_file('data/cuisines_length.dat')
    wl = load_from_file('data/wiki_languages.dat')
    df = load_from_file('data/table_dataframe.dat')
    df_full = load_from_file('data/table_dataframe_full.dat')
    # Plot the dataframes
    step5_create_plots(df, df_full)


if __name__ == '__main__':
    main()