from collections import defaultdict
import csv
import numpy
import random
from scipy import sparse
from sklearn.preprocessing import MultiLabelBinarizer
import tensorrec
import logging
logging.getLogger().setLevel(logging.INFO)

# Open and read in the ratings file
# NOTE: This expects the ratings.csv file to be in the same folder as this Python file
# You can download the MovieLens dataset, including ratings.csv, here:
# https://grouplens.org/datasets/movielens/
print('Loading ratings')
with open('ratings.csv', 'r') as ratings_file:
    ratings_file_reader = csv.reader(ratings_file)
    raw_ratings = list(ratings_file_reader)
    raw_ratings_header = raw_ratings.pop(0)

# Iterate through the input to map MovieLens IDs to new internal IDs
# The new internal IDs will be created by the defaultdict on insertion
movielens_to_internal_user_ids = defaultdict(lambda: len(movielens_to_internal_user_ids))
movielens_to_internal_item_ids = defaultdict(lambda: len(movielens_to_internal_item_ids))
for row in raw_ratings:
    row[0] = movielens_to_internal_user_ids[int(row[0])]
    row[1] = movielens_to_internal_item_ids[int(row[1])]
    row[2] = float(row[2])
n_users = len(movielens_to_internal_user_ids)
n_items = len(movielens_to_internal_item_ids)

# Look at an example raw rating
print("Raw ratings example:\n{}\n{}".format(raw_ratings_header, raw_ratings[0]))

# Shuffle the ratings and split them into train/test sets 80%/20%
random.shuffle(raw_ratings)  # Shuffles the list in-place
cutoff = int(.8 * len(raw_ratings))
train_ratings = raw_ratings[:cutoff]
test_ratings = raw_ratings[cutoff:]
print("{} train ratings, {} test ratings".format(len(train_ratings), len(test_ratings)))


# This method converts a list of (user, item, rating, time) to a sparse matrix
def interactions_list_to_sparse_matrix(interactions):
    users_column, items_column, ratings_column, _ = zip(*interactions)
    return sparse.coo_matrix((ratings_column, (users_column, items_column)),
                             shape=(n_users, n_items))


# Create sparse matrices of interaction data
sparse_train_ratings = interactions_list_to_sparse_matrix(train_ratings)
sparse_test_ratings = interactions_list_to_sparse_matrix(test_ratings)

# Construct indicator features for users and items
user_indicator_features = sparse.identity(n_users)
item_indicator_features = sparse.identity(n_items)

# Build a matrix factorization collaborative filter model
cf_model = tensorrec.TensorRec(n_components=5)

# Fit the collaborative filter model
print("Training collaborative filter")
cf_model.fit(interactions=sparse_train_ratings,
             user_features=user_indicator_features,
             item_features=item_indicator_features)

# Create sets of train/test interactions that are only ratings >= 4.0
sparse_train_ratings_4plus = sparse_train_ratings.multiply(sparse_train_ratings >= 4.0)
sparse_test_ratings_4plus = sparse_test_ratings.multiply(sparse_test_ratings >= 4.0)


# This method consumes item ranks for each user and prints out recall@10 train/test metrics
def check_results(ranks):
    train_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_train_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    test_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_test_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    print("Recall at 10: Train: {:.4f} Test: {:.4f}".format(train_recall_at_10,
                                                            test_recall_at_10))
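

# For intuition only: a hypothetical per-user sketch of what recall@k measures
# (an illustration, not TensorRec's implementation). A user's recall@k is the
# fraction of their held-out liked items that land among their k best-ranked items.
def recall_at_k_for_user(liked_item_ids, ranks_row, k=10):
    # ranks_row[item_id] is the predicted rank of that item for this user (1 = best)
    hits = sum(1 for item_id in liked_item_ids if ranks_row[item_id] <= k)
    return hits / max(len(liked_item_ids), 1)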


# Check the results of the MF CF model
print("Matrix factorization collaborative filter:")
predicted_ranks = cf_model.predict_rank(user_features=user_indicator_features,
                                        item_features=item_indicator_features)
check_results(predicted_ranks)

# Let's try a new loss function: WMRB
print("Training collaborative filter with WMRB loss")
ranking_cf_model = tensorrec.TensorRec(n_components=5,
                                       loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
ranking_cf_model.fit(interactions=sparse_train_ratings_4plus,
                     user_features=user_indicator_features,
                     item_features=item_indicator_features,
                     n_sampled_items=int(n_items * .01))
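# Roughly speaking, the WMRB (weighted margin-rank batch) loss used above is a ranking
# loss: rather than regressing on rating values, it pushes items a user has interacted
# with above a random sample of other items. n_sampled_items sets the size of that
# sample per batch -- here about 1% of the catalog.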

# Check the results of the WMRB MF CF model
print("WMRB matrix factorization collaborative filter:")
predicted_ranks = ranking_cf_model.predict_rank(user_features=user_indicator_features,
                                                item_features=item_indicator_features)
check_results(predicted_ranks)

# To improve the recommendations, let's read in the movie genres
print('Loading movie metadata')
with open('movies.csv', 'r') as movies_file:
    movies_file_reader = csv.reader(movies_file)
    raw_movie_metadata = list(movies_file_reader)
    raw_movie_metadata_header = raw_movie_metadata.pop(0)

# Map the MovieLens IDs to our internal IDs and keep track of the genres and titles
movie_genres_by_internal_id = {}
movie_titles_by_internal_id = {}
for row in raw_movie_metadata:
    row[0] = movielens_to_internal_item_ids[int(row[0])]  # Map to IDs
    row[2] = row[2].split('|')  # Split up the genres
    movie_genres_by_internal_id[row[0]] = row[2]
    movie_titles_by_internal_id[row[0]] = row[1]

# Look at an example movie metadata row
print("Raw metadata example:\n{}\n{}".format(raw_movie_metadata_header,
                                             raw_movie_metadata[0]))

# Build a list of genres where the index is the internal movie ID and
# the value is a list of [Genre, Genre, ...]
movie_genres = [movie_genres_by_internal_id[internal_id]
                for internal_id in range(n_items)]

# Transform the genres into binarized labels using scikit's MultiLabelBinarizer
movie_genre_features = MultiLabelBinarizer().fit_transform(movie_genres)
n_genres = movie_genre_features.shape[1]
print("Binarized genres example for movie {}:\n{}".format(movie_titles_by_internal_id[0],
                                                          movie_genre_features[0]))
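# (For reference: MultiLabelBinarizer builds one indicator column per distinct genre,
# in sorted order, so a toy input like [['Comedy', 'Romance'], ['Comedy']] would become
# [[1, 1], [1, 0]].)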

# Coerce the movie genre features to a sparse matrix, which TensorRec expects
movie_genre_features = sparse.coo_matrix(movie_genre_features)

# Fit a content-based model using the genres as item features
print("Training content-based recommender")
content_model = tensorrec.TensorRec(
    n_components=n_genres,
    item_repr_graph=tensorrec.representation_graphs.FeaturePassThroughRepresentationGraph(),
    loss_graph=tensorrec.loss_graphs.WMRBLossGraph()
)
content_model.fit(interactions=sparse_train_ratings_4plus,
                  user_features=user_indicator_features,
                  item_features=movie_genre_features,
                  n_sampled_items=int(n_items * .01))
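# Note: the FeaturePassThroughRepresentationGraph used above takes the item features
# directly as the item representations (no learned item embedding), which is why
# n_components is set to n_genres -- movies are represented purely by their genres here.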

# Check the results of the content-based model
print("Content-based recommender:")
predicted_ranks = content_model.predict_rank(user_features=user_indicator_features,
                                             item_features=movie_genre_features)
check_results(predicted_ranks)

# Try concatenating the genres onto the indicator features for a hybrid recommender system
full_item_features = sparse.hstack([item_indicator_features, movie_genre_features])
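# The stacked matrix is n_items x (n_items + n_genres): an identity block that lets the
# model learn a per-movie component, plus the genre columns as content signals.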
print("Training hybrid recommender")
hybrid_model = tensorrec.TensorRec(
n_components=5,
loss_graph=tensorrec.loss_graphs.WMRBLossGraph()
)
hybrid_model.fit(interactions=sparse_train_ratings_4plus,
user_features=user_indicator_features,
item_features=full_item_features,
n_sampled_items=int(n_items * .01))
print("Hybrid recommender:")
predicted_ranks = hybrid_model.predict_rank(user_features=user_indicator_features,
item_features=full_item_features)
check_results(predicted_ranks)

# Print out movies that User #432 has liked
print("User 432 liked:")
for m in sparse_train_ratings_4plus[432].indices:
    print(movie_titles_by_internal_id[m])

# Pull user 432's features out of the user features matrix and predict movie ranks for just that user
u432_features = sparse.csr_matrix(user_indicator_features)[432]
u432_rankings = hybrid_model.predict_rank(user_features=u432_features,
                                          item_features=full_item_features)[0]

# Get internal IDs of User 432's top 10 recommendations
# These are sorted by item ID, not by rank
# This may contain items with which User 432 has already interacted
u432_top_ten_recs = numpy.where(u432_rankings <= 10)[0]
print("User 432 recommendations:")
for m in u432_top_ten_recs:
print(movie_titles_by_internal_id[m])
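
# As an extra illustration (not part of the original example): to list the same
# recommendations best-first, argsort the rank vector and take the first 10.
u432_top_ten_in_order = numpy.argsort(u432_rankings)[:10]
print("User 432 recommendations, best first:")
for m in u432_top_ten_in_order:
    print(movie_titles_by_internal_id[m])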

# Print out User 432's held-out interactions
print("User 432's held-out movies:")
for m in sparse_test_ratings_4plus[432].indices:
    print(movie_titles_by_internal_id[m])