Skip to content

Commit

Permalink
added table of stats
Browse files Browse the repository at this point in the history
  • Loading branch information
krozic committed Nov 24, 2021
1 parent 98afecf commit d713fbd
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 16 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@ The python package `pyodbc` was then used to load the tables from this database
### Results

![Rating Distribution](./figures/rating_distribution.png)

| genre | median | top_20perc | num_films |
|:----------|---------:|-------------:|------------:|
| Horror | 6.125 | 6.8 | 1103 |
| Sci-Fi | 6.356 | 7.2 | 644 |
| Family | 6.37059 | 7.3 | 420 |
| Fantasy | 6.4 | 7.3 | 693 |
| Action | 6.41585 | 7.2 | 2041 |
| Thriller | 6.48305 | 7.3 | 1458 |
| Mystery | 6.50278 | 7.4 | 913 |
| Comedy | 6.52558 | 7.3 | 3244 |
| Adventure | 6.61695 | 7.4 | 1500 |
| Romance | 6.7 | 7.4 | 1483 |
| Sport | 6.73333 | 7.4 | 171 |
| Crime | 6.73429 | 7.4 | 1741 |
| Animation | 6.96471 | 7.7 | 426 |
| Drama | 7.00262 | 7.6 | 4790 |
| Musical | 7.15 | 7.6 | 87 |
| Biography | 7.175 | 7.62 | 585 |
| History | 7.2 | 7.7 | 295 |
| Western | 7.3 | 7.7 | 105 |
| War | 7.4 | 8 | 200 |


---

To do:
Expand Down
60 changes: 59 additions & 1 deletion SQLQuery.sql
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,68 @@ FROM IMDBRatings..TitleRatings AS tr
ON tb.tconst = te.imdb_title_id
WHERE (titleType = 'movie' OR titleType = 'tvMovie')
AND te.country NOT LIKE '%India%'
AND numVotes > 10000
AND numVotes > 10000;
-- ORDER BY averageRating ASC, numVotes ASC;


-- Exploring average ratings across media types
-- Lists each distinct titleType among titles with more than 10,000 votes.
-- NOTE(review): te (TitleExtras) is joined but never referenced in this query; confirm the join is needed.
SELECT DISTINCT tb.titleType
FROM IMDBRatings..TitleRatings AS tr
LEFT JOIN IMDBRatings..TitleBasics AS tb
ON tr.tconst = tb.tconst
LEFT JOIN IMDBRatings..TitleExtras AS te
ON tb.tconst = te.imdb_title_id
WHERE numVotes > 10000;
--Result:
--video
--tvShort
--tvMovie
--videoGame
--short
--movie
--tvSeries
--tvMiniSeries
--tvSpecial
--tvEpisode

-- Average rating over titles of type 'movie' or 'tvMovie' with more than 10,000 votes.
-- NOTE(review): te (TitleExtras) is joined but unused here; if it can hold several rows
-- per title, the join would repeat ratings inside the AVG. Confirm it is one row per title.
SELECT AVG(tr.averageRating)
FROM IMDBRatings..TitleRatings AS tr
LEFT JOIN IMDBRatings..TitleBasics AS tb
ON tr.tconst = tb.tconst
LEFT JOIN IMDBRatings..TitleExtras AS te
ON tb.tconst = te.imdb_title_id
WHERE numVotes > 10000
AND (tb.titleType = 'movie' OR tb.titleType = 'tvMovie');
-- Result: 6.65

-- Average rating over titles of type 'tvSeries' or 'tvMiniSeries' with more than 10,000 votes.
-- NOTE(review): te (TitleExtras) is joined but unused here; if it can hold several rows
-- per title, the join would repeat ratings inside the AVG. Confirm it is one row per title.
SELECT AVG(tr.averageRating)
FROM IMDBRatings..TitleRatings AS tr
LEFT JOIN IMDBRatings..TitleBasics AS tb
ON tr.tconst = tb.tconst
LEFT JOIN IMDBRatings..TitleExtras AS te
ON tb.tconst = te.imdb_title_id
WHERE numVotes > 10000
AND (tb.titleType = 'tvSeries' OR tb.titleType = 'tvMiniSeries');
-- Result: 7.73

-- Typically, an episode receives ~10x fewer votes than its parent show,
-- so a 1,000-vote threshold approximately selects episodes of comparable popularity
-- Average rating over titles of type 'tvEpisode' with more than 1,000 votes.
-- NOTE(review): te (TitleExtras) is joined but unused here; if it can hold several rows
-- per title, the join would repeat ratings inside the AVG. Confirm it is one row per title.
SELECT AVG(tr.averageRating)
FROM IMDBRatings..TitleRatings AS tr
LEFT JOIN IMDBRatings..TitleBasics AS tb
ON tr.tconst = tb.tconst
LEFT JOIN IMDBRatings..TitleExtras AS te
ON tb.tconst = te.imdb_title_id
WHERE numVotes > 1000
AND tb.titleType = 'tvEpisode';
-- Result: 8.15










Expand Down
Binary file modified Thumbs.db
Binary file not shown.
37 changes: 22 additions & 15 deletions rating_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
cursor = cnxn.cursor()

# Creating genres list

genres = [
'Action',
'Adventure',
Expand Down Expand Up @@ -66,42 +65,50 @@
db = pd.read_sql(ratings_sql, cnxn)

# Creating Table of Stats

medianVals = {'genre': [],
'median': []}
genreVals = {'genre': [],
'median': [],
'top_20perc': [],
'num_films': []}
for genre in genres:
genre_db = db[db.genres.str.contains(genre, regex=True, na=False)]
num_films = len(genre_db)
top_20perc = genre_db.averageRating.quantile(q=0.8)
genre_db['movierank'] = ((np.arange(len(genre_db))+1)/len(genre_db)*100).round(2)
median = genre_db[genre_db.movierank<52][genre_db.movierank>48].averageRating.mean()
medianVals['genre'].append(genre)
medianVals['median'].append(median)

medianVals = pd.DataFrame(medianVals).sort_values('median')

genreVals['genre'].append(genre)
genreVals['median'].append(median)
genreVals['top_20perc'].append(top_20perc)
genreVals['num_films'].append(num_films)

genreVals = pd.DataFrame(genreVals).sort_values('median')
genreVals_table = tabulate(genreVals,
headers='keys',
tablefmt='pipe',
showindex='never')

# Creating plot with a custom color cycle (one color per genre)
NUM_COLORS = len(genres)

cm = plt.get_cmap('gist_rainbow')
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_prop_cycle('color', [cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)])

for genre in medianVals.genre:
for genre in genreVals.genre:
genre_db = db[db.genres.str.contains(genre, regex=True, na=False)]
genre_db['movierank'] = ((np.arange(len(genre_db))+1)/len(genre_db)*100).round(2)
genre_db = genre_db.groupby('averageRating')['movierank'].mean()
genre_db = genre_db.reset_index()
ax.plot(genre_db.averageRating, genre_db.movierank, label=genre, alpha=0.4)

g1 = ax.grid(b=True, which='major', linestyle='-', linewidth=0.5)
g2 = ax.grid(b=True, which='minor', linestyle='-', linewidth=0.2)
g1 = ax.grid(b=True, which='major', linestyle='-')
g2 = ax.grid(b=True, which='minor', linestyle='-', linewidth=0.4)
ax.minorticks_on()
plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
plt.ylabel('Percent Rank')
plt.xlabel('User Rating')
plt.xticks(np.arange(0, 11, 1))
plt.yticks(np.arange(0, 101, 10))
plt.title('Rating Distribution')
plt.savefig('rating_distribution.png', dpi=600, bbox_inches='tight')
plt.savefig('rating_distribution.png', dpi=200, bbox_inches='tight')



Expand Down

0 comments on commit d713fbd

Please sign in to comment.