-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrank_table.py
65 lines (59 loc) · 2.24 KB
/
rank_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import numpy as np
import pandas as pd
from typing import Tuple, List
# Load the combined ratings table once, at module level. The path is relative
# to the current working directory. The frame is later passed to
# build_rank_tables, which reads its 'genres' column and one rating column.
all_ratings = pd.read_csv('./tables/all_ratings.csv')
# Genre labels to build rank tables for, in alphabetical order.
genres = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Drama', 'Family', 'Fantasy', 'History',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Sport', 'Thriller', 'War', 'Western',
]
def build_rank_tables(all_ratings: pd.DataFrame, genres: list[str], rating: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build per-genre percentile-rank and median tables for one rating column.

    Rows of ``all_ratings`` with a missing ``'genres'`` or ``rating`` value are
    ignored. For each genre, the rows whose ``'genres'`` string matches the
    genre (treated as a regular expression) are ordered by ascending rating
    and given a percentile ``(position + 1) / count * 100`` rounded to two
    decimals.

    Args:
        all_ratings: Frame containing a ``'genres'`` column and the column
            named by ``rating``. It is not modified.
        genres: Genre patterns to tabulate; each becomes one output column.
        rating: Name of the rating column to rank by.

    Returns:
        A ``(genre_rank, genre_medians)`` tuple.

        ``genre_rank`` has a ``'rating'`` column holding each distinct rating
        value in ascending order, plus one column per genre with the mean
        percentile (rounded to one decimal) of that genre's rows at that
        rating; it is NaN where the genre has no row with that rating.

        ``genre_medians`` has columns ``'genre'`` and ``'median'``, sorted by
        ascending ``'median'``. The value is the mean rating of the rows
        whose percentile lies strictly between 48 and 52, and NaN when no
        row falls in that band (e.g. for very small genres).
    """
    db = all_ratings[['genres', rating]].dropna().sort_values(rating)
    genre_rank = None
    genre_medians = {
        'genre': [],
        'median': [],
    }
    for genre in genres:
        in_genre = db['genres'].str.contains(genre, regex=True, na=False)
        # Take an explicit copy so the percentile column is added to an
        # independent frame rather than to a slice of ``db``.
        genre_db = db.loc[in_genre, [rating]].copy()
        n_rows = len(genre_db)
        # ``db`` is already sorted by rating, so row position is the rank.
        genre_db[genre] = ((np.arange(n_rows) + 1) / n_rows * 100).round(2)

        # One combined mask instead of chained ``df[m1][m2]`` indexing, which
        # would apply a full-length mask to an already filtered frame.
        near_middle = (genre_db[genre] > 48) & (genre_db[genre] < 52)
        median = genre_db.loc[near_middle, rating].mean()
        genre_medians['genre'].append(genre)
        genre_medians['median'].append(median)

        # Collapse ties: one row per distinct rating value.
        per_rating = genre_db.groupby(rating)[genre].mean().round(1).reset_index()
        if genre_rank is None:
            genre_rank = per_rating
        else:
            genre_rank = pd.merge(genre_rank, per_rating, how='outer', on=rating)

    if genre_rank is None:
        # No genres requested: return an empty table instead of failing on None.
        genre_rank = pd.DataFrame(columns=[rating])

    genre_rank = genre_rank.sort_values(rating)
    # The rating column is always first; expose it under a source-independent name.
    genre_rank = genre_rank.rename(columns={genre_rank.columns[0]: 'rating'})
    genre_medians = pd.DataFrame(genre_medians).sort_values('median')
    return genre_rank, genre_medians
# Short name of each rating source -> the column of all_ratings that holds it.
rating_source = {'imdb': 'averageRating', 'rt': 'rt_rating', 'mc': 'mc_rating'}

# Build both tables for every source, keyed by the file stem they are saved under.
rank_tables = {}
for source, rating_column in rating_source.items():
    genre_rank, genre_medians = build_rank_tables(all_ratings, genres, rating_column)
    rank_tables[source + '_genre_rank'] = genre_rank
    rank_tables[source + '_genre_medians'] = genre_medians

# `lineterminator` replaces the `line_terminator` keyword, which was deprecated
# in pandas 1.5 and removed in 2.0. '\n' keeps the output identical on every OS.
for table_name, table in rank_tables.items():
    table.to_csv(f'./tables/{table_name}.csv', index=False, lineterminator='\n')

# Index file listing the name of every table written above.
table_names = pd.DataFrame(rank_tables.keys())
table_names.to_csv('./tables/table_names.csv', index=False, lineterminator='\n')