forked from WuLC/MachineLearningAlgorithm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCollaborativeFiltering.py
189 lines (156 loc) · 7.62 KB
/
CollaborativeFiltering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# -*- coding: utf-8 -*-
# @Author: WuLC
# @Date: 2016-04-12 15:53:02
# @Last modified by: WuLC
# @Last Modified time: 2016-04-12 20:26:32
# @Email: [email protected]
# @Function: implementation of User-based collaborative filetering
# @Referer: chaper 2 of the book 《programming-collective-intelligence》
# sample data for test
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,'Superman Returns': 4.0}}
import math
# four different methonds to caculate users' similarity
def user_similarity_on_euclidean(scores,user1,user2):
"""caculate similarity of two users based on Euclidean Distance
Args:
scores (dict{dict{}}): group of users' scores on some movies
user1 (str): one of the user
user2 (str): the other user
Returns:
float: reciprocal of Euclidean Distance(range(0,1)) between user1 and user2, the bigger the more similar
"""
commom = [movie for movie in scores[user1] if movie in scores[user2]]
if len(commom) == 0: #no common item of the two users
return 0
total = sum([math.pow(scores[user1][movie] - scores[user2][movie], 2)
for movie in commom])
similarity=math.sqrt(total)
return 1/(total+1)
def user_similarity_on_cosine(scores,user1,user2):
"""caculate similarity of two users based on cosine similarity
Args:
scores (dict{dict{}}): group of users' scores on some movies
user1 (str): one of the user
user2 (str): the other user
Returns:
float: cosine similarity(range(-1,1)) between user1 and user2, the bigger the more similar
"""
commom = [movie for movie in scores[user1] if movie in scores[user2]]
if len(commom) == 0: #no common item of the two users
return 0
pow_sum_1=sum([math.pow(scores[user1][movie], 2) for movie in commom])
pow_sum_2=sum([math.pow(scores[user2][movie], 2) for movie in commom])
multiply_sum=sum([scores[user1][movie] * scores[user2][movie] for movie in commom])
if pow_sum_1 == 0 or pow_sum_2 == 0:
return 0
else:
similarity = multiply_sum/math.sqrt(pow_sum_2*pow_sum_1)
return similarity
def user_similarity_on_modified_cosine(scores, user1, user2):
"""caculate similarity of two users based on modified cosine similarity
Args:
scores (dict{dict{}}): group of users' scores on some movies
user1 (str): one of the user
user2 (str): the other user
Returns:
float: modified cosine similarity(range(-1,1)) between user1 and user2, the bigger the more similar
"""
commom = [movie for movie in scores[user1] if movie in scores[user2]]
if len(commom) == 0: #no common item of the two users
return 0
average1 = float(sum(scores[user1][movie] for movie in scores[user1]))/len(scores[user1])
average2 = float(sum(scores[user2][movie] for movie in scores[user2]))/len(scores[user2])
# denominator
multiply_sum = sum( (scores[user1][movie]-average1) * (scores[user2][movie]-average2) for movie in commom )
# member
pow_sum_1 = sum( math.pow(scores[user1][movie]-average1, 2) for movie in scores[user1] )
pow_sum_2 = sum( math.pow(scores[user2][movie]-average2, 2) for movie in scores[user2] )
modified_cosine_similarity = float(multiply_sum)/math.sqrt(pow_sum_1*pow_sum_2)
return modified_cosine_similarity
def user_similarity_on_pearson(scores, user1, user2):
"""caculate similarity of two users based on Pearson Correlation Coefficient
Args:
scores (dict{dict{}}): group of users' scores on some movies
user1 (str): one of the user
user2 (str): the other user
Returns:
float: Pearson Correlation Coefficient(range(-1,1)) between user1 and user2, the bigger the more similar
"""
commom = [movie for movie in scores[user1] if movie in scores[user2]]
if len(commom) == 0: #no common item of the two users
return 0
average1 = float(sum(scores[user1][movie] for movie in scores[user1]))/len(scores[user1])
average2 = float(sum(scores[user2][movie] for movie in scores[user2]))/len(scores[user2])
# denominator
multiply_sum = sum( (scores[user1][movie]-average1) * (scores[user2][movie]-average2) for movie in commom )
# member
pow_sum_1 = sum( math.pow(scores[user1][movie]-average1, 2) for movie in commom )
pow_sum_2 = sum( math.pow(scores[user2][movie]-average2, 2) for movie in commom )
modified_cosine_similarity = float(multiply_sum)/math.sqrt(pow_sum_1*pow_sum_2)
return modified_cosine_similarity
def find_similar_users(scores,user,similar_function = user_similarity_on_modified_cosine):
"""find similar users based on the similar-function defined above
Args:
scores (dict{dict{}}): group of users' scores on some movies
user (str): certain user
similar_function (function): certain similar-function defined above
Returns:
list[tuple]: list of users similar to the given user with their score
"""
similar_users = [(similar_function(critics, user, otherUser), otherUser) for otherUser in scores if otherUser!=user]
similar_users.sort() # sort the users in terms of theri similarity score
similar_users.reverse()
# the above two lines are equal to : similar_users.sort(reverse = True)
return similar_users
def recommend_item(scores,user):
"""recommend items to user in terms of scores ,
scores are caculated from similar users to the given user
Args:
scores (dict{dict{}}): group of users' scores on some movies
user (str): certain user
Returns:
list[tuple]: recommend items sorted in terms of their score
"""
similar_users = find_similar_users(scores, user)
swap_similar_users = {v:k for k, v in similar_users} # 交换键值,将存储kv对的列表转换为字典,交换后为无序
all_movies = []
for (k,v) in critics.items():
for movie in v:
if movie not in all_movies:
all_movies.append(movie)
item_score = []
for movie in all_movies:
score_sum = 0
similarity_sum = 0
for similarity, otherUser in similar_users:
if critics[otherUser].has_key(movie):
score_sum += critics[otherUser][movie] * similarity
similarity_sum += swap_similar_users[otherUser]
item_score.append((score_sum/similarity_sum, movie))
item_score.sort(reverse=True)
return item_score
if __name__ == '__main__':
similarList = find_similar_users(critics, 'Lisa Rose')
for i in similarList:
print i
'''
item_score = recommend_item(critics,'Lisa Rose')
for i,j in item_score:
print i,j
'''