-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrainingSet.py
95 lines (82 loc) · 2.59 KB
/
trainingSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Build the sample set used to train and test the classifiers
@author: Alexandre Bisiaux
"""
from randomReader import RandomReader
from unicodeMagic import UnicodeWriter
import os, re, csv
"""
Convert a semicolon-delimited csv file into a tab-delimited file
@param file: path to the csv file
"""
class csvToTab:
    """
    Convert a semicolon-delimited csv file into a tab-delimited file.

    The conversion runs when the object is constructed. The output is written
    to the input path with its extension replaced by ".tab".
    @param file: path to the csv file
    """

    @staticmethod
    def _open(path, mode):
        """Open path for the csv module; mode is 'r' or 'w'."""
        import sys

        # The csv module wants binary file objects on Python 2, but text-mode
        # objects opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, mode + "b")
        return open(path, mode, newline="")

    def __init__(self, file):
        with self._open(file, "r") as fin:
            filecontents = list(csv.reader(fin, delimiter=';'))

        # Swap only the extension: a plain str.replace("csv", "tab") would
        # also rewrite any "csv" found in a directory or base name.
        new = os.path.splitext(file)[0] + ".tab"
        with self._open(new, "w") as fou:
            csv.writer(fou, delimiter='\t').writerows(filecontents)
"""
Build the sample set
@param inputFile: path to the SO users data
@param sampleFile: path to the output file
"""
def buildSampleSet(inputFile, sampleFile, minCount=6):
    """
    Build the sample set

    Columns 0-14 of every input row are copied as-is. Columns 15-24 hold
    category names ("?" meaning no category); they are replaced by one column
    per retained category, holding "1" when the row has that category and "?"
    otherwise.
    @param inputFile: path to the SO users data
    @param sampleFile: path to the output file
    @param minCount: minimum number of rows a category must appear in to be
                     kept as a column (default 6)
    """
    with open(inputFile, "rb") as f, open(sampleFile, "wb") as f1:
        reader = RandomReader(f)
        writer = UnicodeWriter(f1)

        # First pass: distinct categories, in order of first appearance.
        # NOTE(review): the reader is iterated twice, as in the original code;
        # this relies on RandomReader supporting repeated iteration - confirm.
        categories = []
        seen = set()
        for row in reader:
            for cat in row[15:25]:
                if cat != "?" and cat not in seen:
                    seen.add(cat)
                    categories.append(cat)

        # Second pass: encode each row as (fixed columns, one flag per
        # category) and count the rows in which each category appears.
        countPages = [0] * len(categories)
        data = []
        for row in reader:
            present = set(row[15:25])
            flags = []
            for ind, cat in enumerate(categories):
                if cat in present:
                    countPages[ind] += 1
                    flags.append("1")
                else:
                    flags.append("?")
            data.append((list(row[0:15]), flags))

        # Keep only the categories that appear in at least minCount rows
        keep = [cpt >= minCount for cpt in countPages]
        filteredCategories = [cat for cat, kept in zip(categories, keep) if kept]

        # Header
        header = ["uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face",
                  "fCols", "nbCols", "f1", "f2", "f3", "s",
                  "b", "bestGuess"]
        domain = ["c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c", "c", "c", "string"]
        # Third header row; it is shorter than the header and written as-is.
        # NOTE(review): presumably per-column flags for the tab format - confirm.
        attribut = ["m", "c", "m", "m", "m", "m"]
        for cat in filteredCategories:
            header.append(cat)
            domain.append("d")

        writer.writerow(header)
        writer.writerow(domain)
        writer.writerow(attribut)
        for base, flags in data:
            writer.writerow(base + [flag for flag, kept in zip(flags, keep) if kept])
# Build the sample set from the raw sample, then convert the result to tab format
_sampleSetPath = "../resources/sampleSet.csv"
buildSampleSet("../resources/4.5k_sample.csv", _sampleSetPath)
csvToTab(_sampleSetPath)