#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from globalConfig import *
from sklearn import svm
from sklearn import metrics
from getVocab import getVocab
import os
import numpy as np
import random
import argparse
import time

def get_feature_vec(word_context):
    '''Get the feature vector of a word context.

    Args:
    --------
    word_context: a tuple of context words, the first element being the
        target word itself

    Returns:
    --------
    feature_vec: a list-like feature vector built from the word context,
        with dimension WORD_NUM * 94
    '''
    # TODO: reduce the dimension
    # Each word is encoded as a 94-dim bag of printable-ASCII characters:
    # a character's index is its ASCII code minus 33. The per-word vectors
    # are concatenated to form the feature vector.
    feature_vec = []
    for word in word_context:
        word_vec = [0.0] * 94
        for char in word:
            word_vec[ord(char) - 33] += 1.0  # (ascii - 33)
        feature_vec += word_vec
    return feature_vec
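
# A minimal usage sketch of get_feature_vec (the context tuple below is a
# hypothetical example, not taken from any dataset):
#
#   >>> vec = get_feature_vec(("error", "disk"))
#   >>> len(vec)             # two words x 94 printable-ASCII buckets
#   188
#   >>> vec[ord('r') - 33]   # 'r' occurs three times in "error"
#   3.0
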
def data_loader(inputPath, trainingRate=0.5, rawlogPath="", rex=[], contextNum=[1, 0]):
    '''Load data for classifier training.

    Args:
    --------
    inputPath: the path to the input data directory
    trainingRate: the ratio used to split training and testing data, default 0.5
    rawlogPath: the path to the raw log file (possibly outside inputPath), default ""
    rex: list of regular expressions used to clean the log, default []
    contextNum: number of context words to use, as [preceding, succeeding],
        default [1, 0]

    Returns:
    --------
    train_data: training data vectors, one row per word
    train_labels: training labels, one row per word
    testing_data: testing data vectors, one row per word
    testing_labels: testing labels, one row per word
    '''
    print("Data loaded from %s, training rate is %f" % (inputPath, trainingRate))
    template_vocab = getVocab(inputPath, rawlogPath=rawlogPath, rex=rex, contextNum=contextNum)
    vector_list = []
    for key, value in template_vocab.items():
        # template words are positive examples, variable words negative
        for plabel in value["template_vocab"]:
            vector_list.append((get_feature_vec(plabel), [1.0]))
        for nlabel in value["variable_vocab"]:
            vector_list.append((get_feature_vec(nlabel), [-1.0]))
    random.shuffle(vector_list)
    total_num = len(vector_list)
    divide_index = int(total_num * trainingRate)
    train_data = np.array([i[0] for i in vector_list[:divide_index]], dtype=float)
    train_labels = np.array([i[1] for i in vector_list[:divide_index]], dtype=float)
    testing_data = np.array([i[0] for i in vector_list[divide_index:]], dtype=float)
    testing_labels = np.array([i[1] for i in vector_list[divide_index:]], dtype=float)
    print("train_data shape is", train_data.shape)
    print("testing_data shape is", testing_data.shape)
    return train_data, train_labels, testing_data, testing_labels
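
# A sketch of how data_loader is typically invoked (the paths below are
# assumptions; see __main__ at the bottom for the actual call):
#
#   >>> tr_x, tr_y, te_x, te_y = data_loader(
#   ...     "data/2kBGL", trainingRate=0.7, rawlogPath="data/2kBGL",
#   ...     rex=[], contextNum=[1, 0])
#   >>> tr_x.shape[1]   # (1 preceding word + target word) * 94 dims per row
#   188
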
def SVM(train_data, train_labels, testing_data, testing_labels, test=True):
    '''Train an SVM classifier.

    Args:
    --------
    train_data: training data vectors, one row per word
    train_labels: training labels, one row per word
    testing_data: testing data vectors, one row per word
    testing_labels: testing labels, one row per word
    test: whether to evaluate on the testing data, default True

    Returns:
    --------
    clf: the trained SVM classifier
    '''
    print("SVM classifier")
    clf = svm.LinearSVC(penalty='l2', tol=1e-4, C=1.0, dual=True,
                        fit_intercept=True, intercept_scaling=1,
                        class_weight="balanced", max_iter=1000000)
    clf.fit(train_data, train_labels.ravel())
    if not test:
        return clf
    prediction = list(clf.predict(testing_data))
    assert len(prediction) == len(testing_labels)
    printLine()
    print("accuracy: %0.3f" % metrics.accuracy_score(testing_labels, prediction))
    print("recall: %0.3f" % metrics.recall_score(testing_labels, prediction))
    print("f-score: %0.3f" % metrics.f1_score(testing_labels, prediction))
    printLine()
    return clf
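
# Applying a trained classifier to a new word context (illustrative only;
# per data_loader above, 1.0 marks a template word and -1.0 a variable word):
#
#   >>> clf = SVM(tr_x, tr_y, te_x, te_y, test=False)
#   >>> clf.predict([get_feature_vec(("error", "disk"))])
#   array([ 1.])
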
if __name__ == "__main__":
    # run as a script to evaluate the classifier
    # USAGE: ./classifier.py [-dataset {DATASET}] [-ratio {RATIO}] [-preceding {PRECEDING}] [-succeeding {SUCCEEDING}]
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', help=DATASET_STR, type=str, default='2kBGL')
    parser.add_argument('-ratio', help='ratio used to split the training data', type=float, default=0.5)
    parser.add_argument('-preceding', help='number of preceding words', type=int, default=-1)
    parser.add_argument('-succeeding', help='number of succeeding words', type=int, default=-1)
    args = parser.parse_args()
    dataset = args.dataset
    ratio = args.ratio
    # fall back to the configured context sizes when not given on the command line
    if args.preceding < 0 or args.succeeding < 0:
        preceding = ContextNum["preceding_word_num"]
        succeeding = ContextNum["succeeding_word_num"]
    else:
        preceding = args.preceding
        succeeding = args.succeeding
    input_path = os.path.join(DATA_PATH, dataset)
    t1 = time.time()
    train_data, train_labels, testing_data, testing_labels = data_loader(
        input_path, trainingRate=ratio, rawlogPath=input_path,
        rex=regL[dataset], contextNum=[preceding, succeeding])
    SVM(train_data, train_labels, testing_data, testing_labels, test=True)
    t2 = time.time()
    print("time: %0.3f" % (t2 - t1))