6
6
# It is made available under the MIT License
7
7
8
8
import os
9
+ import sys
9
10
import collections
10
11
import csv
11
12
import json
@@ -57,7 +58,7 @@ def load_sanders_data(dirname=".", line_count=-1):
57
58
try :
58
59
tweet = json .load (open (tweet_fn , "r" ))
59
60
except IOError :
60
- print ("Tweet '%s' not found. Skip." % tweet_fn )
61
+ print (( "Tweet '%s' not found. Skip." % tweet_fn ) )
61
62
continue
62
63
63
64
if 'text' in tweet and tweet ['user' ]['lang' ] == "en" :
@@ -84,14 +85,14 @@ def plot_pr(auc_score, name, phase, precision, recall, label=None):
84
85
pylab .title ('P/R curve (AUC=%0.2f) / %s' % (auc_score , label ))
85
86
filename = name .replace (" " , "_" )
86
87
pylab .savefig (os .path .join (CHART_DIR , "pr_%s_%s.png" %
87
- (filename , phase )), bbox_inches = "tight" )
88
+ (filename , phase )), bbox_inches = "tight" )
88
89
89
90
90
91
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n lowest- and n highest-weighted features side by side.

    Each feature name is paired with its coefficient from ``clf.coef_[0]``
    and the pairs are sorted ascending (by coefficient, ties broken by
    name).  Every printed row shows one entry from the low end of that
    ordering next to one entry from the high end, the high end being
    walked from the largest coefficient downwards.

    Args:
        vectorizer: Object exposing ``get_feature_names_out()`` or the
            older ``get_feature_names()``; the returned names must line up
            with the entries of ``clf.coef_[0]``.
        clf: Fitted model exposing ``coef_``; only its first row is used.
        n: Number of rows to print (fewer if there are fewer features).
    """
    # Newer scikit-learn releases only provide get_feature_names_out();
    # fall back to the legacy accessor so older vectorizers keep working.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    c_f = sorted(zip(clf.coef_[0], get_names()))
    lowest = c_f[:n]
    highest = c_f[:-(n + 1):-1]  # last n entries, largest coefficient first

    # zip() is consumed exactly once, so there is no need to build a list.
    for (c1, f1), (c2, f2) in zip(lowest, highest):
        print("\t %.4f\t %-15s\t \t %.4f\t %-15s" % (c1, f1, c2, f2))
95
96
96
97
97
98
def plot_log ():
@@ -119,7 +120,7 @@ def plot_feat_importance(feature_names, clf, name):
119
120
inds = np .argsort (coef )
120
121
f_imp = f_imp [inds ]
121
122
coef = coef [inds ]
122
- xpos = np .array (range (len (coef )))
123
+ xpos = np .array (list ( range (len (coef ) )))
123
124
pylab .bar (xpos , coef , width = 1 )
124
125
125
126
pylab .title ('Feature importance for %s' % (name ))
@@ -181,8 +182,13 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name):
181
182
def load_sent_word_net ():
182
183
183
184
sent_scores = collections .defaultdict (list )
185
+ sentiwordnet_path = os .path .join (DATA_DIR , "SentiWordNet_3.0.0_20130122.txt" )
184
186
185
- with open (os .path .join (DATA_DIR , "SentiWordNet_3.0.0_20130122.txt" ), "r" ) as csvfile :
187
+ if not os .path .exists (sentiwordnet_path ):
188
+ print ("Please download SentiWordNet_3.0.0 from http://sentiwordnet.isti.cnr.it/download.php, extract it and put it into the data directory" )
189
+ sys .exit (1 )
190
+
191
+ with open (sentiwordnet_path , 'r' ) as csvfile :
186
192
reader = csv .reader (csvfile , delimiter = '\t ' , quotechar = '"' )
187
193
for line in reader :
188
194
if line [0 ].startswith ("#" ):
@@ -200,7 +206,7 @@ def load_sent_word_net():
200
206
term = term .replace ("-" , " " ).replace ("_" , " " )
201
207
key = "%s/%s" % (POS , term .split ("#" )[0 ])
202
208
sent_scores [key ].append ((float (PosScore ), float (NegScore )))
203
- for key , value in sent_scores .iteritems ():
209
+ for key , value in sent_scores .items ():
204
210
sent_scores [key ] = np .mean (value , axis = 0 )
205
211
206
212
return sent_scores
0 commit comments