-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
181 lines (146 loc) · 6.09 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Main Program
Download the gravatar pictures of SO users
Grab information about them on Google and Wikipedia
Compute some visual features
Store this information in a CSV file
@author: Alexandre Bisiaux
"""
from google import GoogleSearch, GoogleImage
from wikipedia import Wikipedia
from downloader import Downloader # Downloader module (for the mailing list archive files)
from Queue import Queue # Multi-threading support for Downloader
from unicodeMagic import UnicodeReader, UnicodeWriter
from randomReader import RandomReader
from faceDetection import FaceDetector
import pictureUtils as picUtils
import time, os, sys, urllib2
"""
FEATURES SELECTION
"""
# Switches read by main() to decide which values are collected for each user
# and written to the output row.

# Download the user's picture (if not already on disk) and load it.
_VISUAL_FEATURES = True
# Create a FaceDetector and record whether a front or profile face is found.
_FACE = True
# Record the sum of the two values kept from picUtils.mostCommonColor().
_MOST_COMMON_COLORS = True
# Record picUtils.getNbOfColors().
_NBCOLORS = True
# Record picUtils.farthestNeighborMetric() for 10 and 200, plus their ratio.
_FARTHEST_NEIGHBOR = True
# Record picUtils.avgSaturation().
_AVERAGE_SATURATION = True
# Record picUtils.threBrightness() with a 0.2 argument.
_THRESHOLD_BRIGHTNESS = True
# Record the "best guess" returned by GoogleImage for the gravatar URL.
_GOOGLE = False
# Search en.wikipedia.org for the best guess and record up to 10 entries of
# the resulting Wikipedia category graph, sorted by degree.
_WIKIPEDIA = False
"""
RANDOM SET
"""
# When True, main() reads the user dump with RandomReader instead of
# UnicodeReader.
_RANDOM = True
def isDefaultGravatarPic(so_hash):
    """
    Check if a so_hash corresponds to a default gravatar picture.

    The avatar is requested with the "d=404" option, so the request raises
    when no custom picture is available for the hash. Any failure to open the
    URL (including network errors) is reported as a default picture.

    @param so_hash: Mail hash of the SO user
    @return: True if it links to the default gravatar picture, False otherwise
    """
    url = 'http://www.gravatar.com/avatar/%s' % so_hash
    try:
        response = urllib2.urlopen("%s?d=404" % url)
    except Exception:
        # Best-effort: an HTTP 404 means "default picture"; other errors are
        # treated the same way so that the caller simply skips this user.
        return True
    # Only the status matters; release the connection instead of leaking it.
    response.close()
    return False
def _visualFeatures(pic, faceDetector):
    """
    Compute the enabled visual features of a loaded picture.

    @param pic: Picture as returned by picUtils.loadPicture
    @param faceDetector: FaceDetector instance (only used when _FACE is set)
    @return: List of feature values, already converted to strings
    """
    features = []
    if _FACE:
        hasFace = faceDetector.isFrontFace(pic) or faceDetector.isProfileFace(pic)
        features.append(str(bool(hasFace)))
    if _MOST_COMMON_COLORS:
        _, f1, _, f2 = picUtils.mostCommonColor(pic)
        features.append(str(f1 + f2))
    if _NBCOLORS:
        features.append(str(picUtils.getNbOfColors(pic)))
    if _FARTHEST_NEIGHBOR:
        F1 = picUtils.farthestNeighborMetric(pic, 10)
        F2 = picUtils.farthestNeighborMetric(pic, 200)
        features.append(str(F1))
        features.append(str(F2))
        # NOTE(review): under Python 2 this truncates if both metrics are
        # integers -- confirm the type returned by farthestNeighborMetric.
        features.append(str(F2 / F1) if F1 != 0 else '?')
    if _AVERAGE_SATURATION:
        features.append(str(picUtils.avgSaturation(pic)))
    if _THRESHOLD_BRIGHTNESS:
        features.append(str(picUtils.threBrightness(pic, 0.2)))
    return features


def _webFeatures(picUrl):
    """
    Look up a picture on Google and, if enabled, on Wikipedia.

    @param picUrl: URL of the gravatar picture
    @return: List with the Google best guess followed by up to 10 Wikipedia
             categories; empty when Google returns no best guess
    """
    features = []
    bestGuess = GoogleImage(picUrl).getBestGuess()
    if not bestGuess:
        return features
    bestGuess = bestGuess.encode('utf8')
    features.append(bestGuess)
    # The Wikipedia search is built from the best guess, so it only makes
    # sense once a best guess exists.
    if _WIKIPEDIA:
        gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
        wikiTitlePage = gs.getWikipediaTitlePage()
        if wikiTitlePage:
            wiki = Wikipedia(wikiTitlePage)
            wiki.categoryGraph(4)
            nbCats = 10
            cats = wiki.sortGraphByDegree()
            for i in range(min(nbCats, len(cats))):
                features.append(str(cats[i]))
    return features


def main():
    """
    Main Program

    Read the SO user dump, skip duplicated hashes and users with the default
    gravatar picture, collect the enabled features for every remaining user
    and append one row per user to the results csv file. Stops after `size`
    rows have been written.
    """
    dataPath = "../resources/SOusers-Mar13.csv" # File containing SO user dump
    resultsPath = "../resources/features3.csv" # File where features will be stored
    picPath = "../resources/SOpictures/" # Directory where pictures will be downloaded
    nbThreads = 10 # Number of downloader threads
    size = 4500 # Number of subjects

    # Both files are closed even if processing fails half-way.
    with open(dataPath, 'rb') as fr, open(resultsPath, 'ab') as fw:
        reader = RandomReader(fr) if _RANDOM else UnicodeReader(fr)
        writer = UnicodeWriter(fw)
        queue = Queue()
        faceDetector = FaceDetector() if _FACE else None
        threads = []
        seenHashes = set() # Hashes already processed
        idx = 0
        try:
            # Use multiple threads to download and get information
            for _ in range(nbThreads):
                threads.append(Downloader(queue))
                threads[-1].start()

            for row in reader:
                if idx >= size:
                    break
                so_uid = row[0]
                so_hash = row[2]
                if so_hash in seenHashes:
                    continue
                seenHashes.add(so_hash)
                if isDefaultGravatarPic(so_hash):
                    continue

                picUrl = 'http://www.gravatar.com/avatar/%s' % so_hash
                features = [so_uid]
                if _VISUAL_FEATURES:
                    # Download picture
                    filepath = '%s%d.jpg' % (picPath, int(so_uid))
                    if not os.path.isfile(filepath):
                        queue.put((picUrl, filepath))
                        # NOTE(review): fixed wait for the downloader thread;
                        # loading fails if the download takes longer.
                        time.sleep(2)
                    # Load picture
                    pic = picUtils.loadPicture(filepath)
                    features.extend(_visualFeatures(pic, faceDetector))
                if _GOOGLE:
                    features.extend(_webFeatures(picUrl))

                # Write all information collected in the csv file
                try:
                    print(features)
                    writer.writerow(features)
                except Exception:
                    # Best-effort: a row that cannot be written is skipped
                    # and does not count towards `size`.
                    print("Error with data")
                else:
                    idx += 1
        finally:
            # Always tell the started threads to stop, otherwise they would
            # keep the process alive after an error.
            for _ in threads:
                queue.put((None, None))
if __name__ == "__main__":
    # Time the whole run. The elapsed time must be reported before calling
    # sys.exit(), which raises SystemExit and skips anything placed after it.
    t = time.time()
    status = main()
    elapsed = time.time() - t
    print("Time elapsed = %s" % elapsed)
    sys.exit(status)