forked from VowpalWabbit/vowpal_wabbit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvw-csv2bin
executable file
·125 lines (81 loc) · 2.96 KB
/
vw-csv2bin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
# 2013 Eric Whyne
# http://www.datamungeblog.com
import re
import sys, getopt
# Default settings; each can be overridden from the command line.
infilename = ''        # -i: input path, empty means read stdin
outfilename = ''       # -o: output path, empty means write stdout
# Raw strings keep the regex backslash literal without relying on Python
# passing an unrecognised escape sequence ('\+') through unchanged.
pregex = r'Y(es)?|T(rue)?|\+?1'   # -p: label text treated as the positive class
nregex = r'No?|F(alse)?|0|-1'     # -n: label text treated as the negative class
category_index = 0     # -c: index of the csv column holding the label
delimeter = ','        # -d: csv column separator
def printhelp():
    """Print the command-line usage summary for this script to stdout."""
    # A parenthesised single-expression print behaves identically as a
    # Python 2 print statement and as a Python 3 print() call.
    # The documented -p default is written with its backslash so that it
    # matches the real default and is a valid regex if copied verbatim.
    print("\n" + sys.argv[0] + """ converts csv data into vw binary classifier training data.
Options:
-h Print this help.
-i <filename> input file, if not defined will use stdin
-o <filename> output file, if not defined will use stdout
-p <regex> regex identifying positive side of binary classification
if not defined will use 'Y(es)?|T(rue)?|\\+?1'
-n <regex> regex identifying negative side of binary classification
if not defined will use 'No?|F(alse)?|0|-1'
-c <integer> csv column which binary classification resides
if not defined will use 0
-d <delimeter> specifies boundaries used to separate csv columns
if not defined will use ','
Examples:
cat data.csv | ./vw-csv2bin -c 14 -p '>' -n '<' > training.vw
./vw-csv2bin -i data.csv -o training.vw -d '\\t' -c 14 -p '>' -n '<'
""")
# Vertical bar, colon and whitespace are unsupported inside vw feature text,
# so they are removed from every csv field.
_UNSUPPORTED_CHARS = re.compile(r'[|:\s]')

# A field counts as numeric only if it contains at least one digit: optional
# leading '-', digits with at most one decimal point, and an optional signed
# exponent. Empty strings, a lone '-' or '.', and values such as '1.2.3' are
# deliberately not numeric.
_NUMBER = re.compile(r'^-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:e[+-][0-9]+)?$')


def decode_delimiter(raw):
    """Return the delimiter given to -d with a literal backslash-t turned into a tab.

    A shell passes '\\t' through as two characters (backslash, 't'), so the
    documented usage ``-d '\\t'`` needs this translation to split on tabs.
    """
    return raw.replace('\\t', '\t')


def convert_line(line, delim, label_index, positive, negative):
    """Convert one csv record into one vw binary-classification example line.

    line        -- raw csv record, with or without its trailing newline
    delim       -- column separator string
    label_index -- index of the column holding the class label
    positive    -- regex (searched, not anchored) marking the positive class
    negative    -- regex (searched, not anchored) marking the negative class;
                   it is tried first, so it wins when both match

    Returns "<label> | <features...> \\n" where label is '1' or '-1'. Numeric
    columns are emitted as "f<column>:<value>", other columns as bare tokens,
    and empty columns are omitted. Column numbers are counted after the label
    column has been removed.

    Raises ValueError if the label column is missing or matches neither regex.
    """
    # Split first, then sanitise each field: stripping characters from the
    # whole line first would also delete a tab, space, ':' or '|' delimiter.
    fields = [_UNSUPPORTED_CHARS.sub('', field) for field in line.split(delim)]
    record = delim.join(fields)  # sanitised record, used in error messages

    try:
        label = fields.pop(label_index)
    except IndexError:
        raise ValueError("Category column " + str(label_index)
                         + " does not exist in record:\n" + record)

    if re.search(negative, label):  # regex for negative category
        label = '-1'
    elif re.search(positive, label):  # regex for positive category
        label = '1'
    else:
        raise ValueError("Regex did not match a record, exiting.\nPostive Regex: "
                         + positive + "\nNegative Regex: " + negative
                         + "\nRecord:\n" + record)

    features = []
    for colnum, col in enumerate(fields):
        if not col:
            # An empty column carries no value; skip it but keep numbering.
            continue
        if _NUMBER.match(col):
            # Numeric feature: give it a name derived from its column.
            features.append("f" + str(colnum) + ":" + col)
        else:
            # String feature: let vw handle the token directly.
            features.append(col)

    # Every feature is followed by a single space, as in the original output.
    return label + " | " + "".join(feature + " " for feature in features) + "\n"


def main(argv=None):
    """Parse command-line options and convert csv input to vw training data.

    argv -- option list to parse; defaults to sys.argv[1:].

    Reads from the -i file or stdin, writes to the -o file or stdout, and
    exits with a message on the first record that cannot be converted.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Start from the module-level defaults; options override them.
    in_name = infilename
    out_name = outfilename
    positive = pregex
    negative = nregex
    label_index = category_index
    delim = delimeter

    try:
        opts, _unused = getopt.getopt(argv, "hi:o:p:n:c:d:")
    except getopt.GetoptError:
        printhelp()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            printhelp()
            sys.exit()
        elif opt == '-i':
            in_name = arg
        elif opt == '-o':
            out_name = arg
        elif opt == '-p':
            positive = arg
        elif opt == '-n':
            negative = arg
        elif opt == '-c':
            try:
                label_index = int(arg)
            except ValueError:
                sys.exit("-c expects an integer column index, got: " + arg)
        elif opt == '-d':
            delim = decode_delimiter(arg)

    if not delim:
        # str.split('') raises ValueError; report it as a usage error instead.
        sys.exit("-d expects a non-empty delimiter")

    # Only files opened here are closed here; stdin/stdout are left alone.
    source = open(in_name, 'r') if in_name else sys.stdin
    try:
        sink = open(out_name, 'w') if out_name else sys.stdout
        try:
            for line in source:
                try:
                    sink.write(convert_line(line, delim, label_index,
                                            positive, negative))
                except ValueError as err:
                    sys.exit(str(err))
        finally:
            if out_name:
                sink.close()
    finally:
        if in_name:
            source.close()


if __name__ == "__main__":
    main()