-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathutils.py
146 lines (119 loc) · 4.68 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import numpy
import scipy.sparse
import os.path
def read_dense_data(train_file, test_file, user_file, item_file, feature_mat_type):
    '''
    placeholder for a dense-format reader (not implemented)

    Every argument is accepted but ignored; the function returns None.
    '''
    return None
def loadTripleData(filename):
    '''
    load triple data (row, column, value) to csc_matrix format

    Each line of `filename` holds one comma-separated triple. The matrix
    shape is inferred from the largest row and column index in the file;
    entries that repeat the same (row, column) are summed by the sparse
    conversion.

    Args:
        filename: path of the comma-delimited triple file.

    Returns:
        scipy.sparse.csc_matrix holding the listed values.
    '''
    # ndmin=2 keeps a one-line file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slices below would be scalars.
    triples = numpy.loadtxt(filename, delimiter=',', ndmin=2)
    # loadtxt parses every column as float; cast the index columns explicitly
    # instead of relying on scipy to coerce float indices.
    rows = triples[:, 0].astype(int)
    cols = triples[:, 1].astype(int)
    return scipy.sparse.coo_matrix((triples[:, 2], (rows, cols))).tocsc()
def read_triple_data(train, test, user, item, feature_mat_type):
    '''
    read data from three column format (row, column, value)

    Args:
        train: path of the comma-delimited training triples (user, item, value).
        test: path of the comma-delimited test triples (user, item, value).
        user: path of the user-feature triples, or '' when there are none.
        item: path of the item-feature triples, or '' when there are none.
        feature_mat_type: mode given to every feature relation; one of
            'sparse', 'dense' or 'log_dense'.

    Returns:
        [Xs_trn, Xs_tst, rc_schema, modes] where
        Xs_trn    = training matrix followed by the loaded feature matrices,
        Xs_tst    = test matrix followed by None for each feature relation,
        rc_schema = numpy array of [row entity, column entity] per relation,
        modes     = mode string per relation.

    Raises:
        AssertionError: if feature_mat_type is not recognized, or if neither
            user nor item features are given.
    '''
    assert feature_mat_type in ['sparse', 'dense', 'log_dense'], 'Unrecognized link function'

    def to_csc(triples, shape):
        # loadtxt yields floats; the index rows are cast to int explicitly.
        rows = triples[0].astype(int)
        cols = triples[1].astype(int)
        return scipy.sparse.coo_matrix((triples[2], (rows, cols)), shape=shape).tocsc()

    def pad_rows(feat, num_rows):
        # Feature files infer their row count from the largest index they
        # contain, which can be smaller than the number of users/items seen
        # in train/test. Extend with empty rows so the shapes agree.
        if feat.shape[0] == num_rows:
            return feat
        feat = feat.tocsc()
        return scipy.sparse.csc_matrix(
            (feat.data, feat.indices, feat.indptr),
            shape=(num_rows, feat.shape[1]))

    # need to make sure training & testing data with the same shapes as user and item features
    num_user = num_item = 0
    if user != '':
        X_userFeat = loadTripleData(user)
        num_user = X_userFeat.shape[0]
    if item != '':
        X_itemFeat = loadTripleData(item)
        num_item = X_itemFeat.shape[0]

    # ndmin=2 keeps one-line files two-dimensional before transposing.
    Dtrain = numpy.loadtxt(train, delimiter=',', ndmin=2).T
    Dtest = numpy.loadtxt(test, delimiter=',', ndmin=2).T

    # Entity counts cover the largest index seen in train, test and features.
    num_user = int(max(Dtrain[0].max(), Dtest[0].max(), num_user - 1)) + 1
    num_item = int(max(Dtrain[1].max(), Dtest[1].max(), num_item - 1)) + 1

    X_train = to_csc(Dtrain, (num_user, num_item))
    X_test = to_csc(Dtest, (num_user, num_item))

    if user != '':
        X_userFeat = pad_rows(X_userFeat, num_user)
    if item != '':
        X_itemFeat = pad_rows(X_itemFeat, num_item)

    # user or item features
    if user != '' and item != '':
        Xs_trn = [X_train, X_userFeat, X_itemFeat]
        Xs_tst = [X_test, None, None]
        # [row entity number, column entity number]
        # 0=user, 1=item, 2=userFeat, 3=itemFeat
        rc_schema = numpy.array([[0, 1], [0, 2], [1, 3]])
        # modes of each relation: sparse, dense or log_dense
        # dense if Wij = 1 for all ij
        # sparse if Wij = 1 if Xij>0
        # log if link function = logistic
        modes = ['sparse', feature_mat_type, feature_mat_type]
    elif user == '' and item != '':
        Xs_trn = [X_train, X_itemFeat]
        Xs_tst = [X_test, None]
        rc_schema = numpy.array([[0, 1], [1, 2]])  # 0=user, 1=item, 2=itemFeat
        modes = ['sparse', feature_mat_type]
    elif user != '' and item == '':
        Xs_trn = [X_train, X_userFeat]
        Xs_tst = [X_test, None]
        rc_schema = numpy.array([[0, 1], [0, 2]])  # 0=user, 1=item, 2=userFeat
        modes = ['sparse', feature_mat_type]
    elif user == '' and item == '':
        assert False, 'No user and item features.'
        # Only reached when assertions are disabled: plain single-relation setup.
        Xs_trn = [X_train]
        Xs_tst = [X_test]
        rc_schema = numpy.array([[0, 1]])
        modes = ['sparse']

    return [Xs_trn, Xs_tst, rc_schema, modes]
def get_config(Xs, rc_schema):
    '''
    derive the entity layout described by the relation schema
    ---------------------
    S = number of entities
    Ns = number of instances for each entity

    Each relation Xs[i] has rows of entity rc_schema[i, 0] and columns of
    entity rc_schema[i, 1]; every relation touching the same entity must
    agree on that entity's size.
    '''
    assert len(rc_schema) == len(Xs), "rc_schema lenth must be the same as input data."

    S = rc_schema.max() + 1
    # -1 marks an entity whose size has not been recorded yet
    Ns = numpy.full(S, -1, dtype=int)

    for idx, relation in enumerate(Xs):
        n_rows, n_cols = relation.shape
        # row entity first, then column entity
        for entity, size in ((rc_schema[idx, 0], n_rows), (rc_schema[idx, 1], n_cols)):
            if Ns[entity] >= 0:
                assert Ns[entity] == size, "rc_schema does not match data."
            else:
                Ns[entity] = size

    return [S, Ns]
def RMSE(X, Y):
    '''
    root-mean-square error over the stored entries of two sparse matrices

    X is prediction, Y is ground truth
    Both X and Y should be scipy.sparse.csc_matrix with the same sparsity
    structure (identical indices and indptr) and at least one stored entry.

    Returns:
        sqrt( sum((X.data - Y.data)^2) / X.size )

    Raises:
        AssertionError: if the matrices are empty or their structures differ.
    '''
    # array_equal returns False for arrays of different length, so a shape
    # mismatch fails the assertion cleanly instead of erroring in the compare.
    assert (
        X.size > 0
        and X.size == Y.size
        and numpy.array_equal(X.indices, Y.indices)
        and numpy.array_equal(X.indptr, Y.indptr)
    ), 'X and Y must be non-empty and share the same sparsity structure.'
    diff = X.data - Y.data
    return numpy.sqrt(numpy.sum(diff * diff) / X.size)
def MAE(X, Y):
    '''
    mean absolute error over the stored entries of two sparse matrices

    X is prediction, Y is ground truth
    Both X and Y should be scipy.sparse.csc_matrix with the same sparsity
    structure (identical indices and indptr) and at least one stored entry.

    Returns:
        sum(|X.data - Y.data|) / X.size

    Raises:
        AssertionError: if the matrices are empty or their structures differ.
    '''
    # array_equal returns False for arrays of different length, so a shape
    # mismatch fails the assertion cleanly instead of erroring in the compare.
    assert (
        X.size > 0
        and X.size == Y.size
        and numpy.array_equal(X.indices, Y.indices)
        and numpy.array_equal(X.indptr, Y.indptr)
    ), 'X and Y must be non-empty and share the same sparsity structure.'
    return numpy.sum(numpy.abs(X.data - Y.data)) / X.size
def check_modes(modes):
    '''
    assert that every entry of modes is one of 'sparse', 'dense', 'log_dense'

    Stops at the first unrecognized entry with an AssertionError naming it.
    '''
    known_modes = ('sparse', 'dense', 'log_dense')
    for mode in modes:
        assert mode in known_modes, 'Unrecognized mode: {}'.format(mode)
def string2list(input_string, num, sep='-'):
    '''
    split input_string on sep and convert each piece to float

    The number of pieces must equal num; otherwise an AssertionError is
    raised before any conversion takes place.
    '''
    pieces = input_string.split(sep)
    assert len(pieces) == num, 'argument alphas must be the same length as numbers of relations.'
    return list(map(float, pieces))
def save_result(args, rmse):
    '''
    append one result row to the CSV file named by args.out

    The row records which side features were used (derived from args.user
    and args.item), the settings args.k, args.reg, args.lr, args.tol and
    args.alphas, and rmse formatted to four decimals. A header line is
    written first when the file does not exist yet. Nothing is written
    when args.out is ''.
    '''
    uses_user = args.user != ''
    uses_item = args.item != ''
    # (user features given, item features given) -> label stored in the row
    labels = {
        (True, True): 'useritem',
        (False, True): 'item',
        (True, False): 'user',
        (False, False): 'none',
    }
    cmf_type = labels[(uses_user, uses_item)]

    if args.out == '':
        return

    if not os.path.exists(args.out):
        with open(args.out, 'w') as fp:
            fp.write('type,k,reg,lr,tol,alphas,RMSE\n')

    row = '{},{},{},{},{},{},{:.4f}\n'.format(cmf_type, args.k, args.reg, args.lr, args.tol, args.alphas, rmse)
    with open(args.out, 'a') as fp:
        fp.write(row)