Skip to content

Commit

Permalink
ML improvements and bug fixes
Browse files Browse the repository at this point in the history
ML: Do not use various classifiers; a decision tree classifier is more than enough.
BUG: Do not read the data twice when training the model.
CORE: Calculate the probability of the match being a good one or not, instead of answering with a hardcoded 1 or 0.
  • Loading branch information
joxeankoret committed Dec 7, 2018
1 parent 8417984 commit ac7010d
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 15 deletions.
Binary file modified ml/clf.pkl
Binary file not shown.
27 changes: 16 additions & 11 deletions ml/pigaios_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,7 @@ def warn(*args, **kwargs):
# very interesting, but I'll leave it here.
#
ML_CLASSIFIERS = [
(tree.DecisionTreeClassifier, "Decision Tree Classifier", "gini"),
(naive_bayes.BernoulliNB, "Bernoulli Naive Bayes", 1.0),
(ensemble.GradientBoostingClassifier, "Gradient Boosting Classifier", "deviance"),
(ensemble.RandomForestClassifier, "Random Forest Classifier", 10),
(tree.DecisionTreeClassifier, "Decision Tree Classifier", []),
]

#-------------------------------------------------------------------------------
Expand Down Expand Up @@ -114,12 +111,16 @@ def predict(self, X):
class CPigaiosMultiClassifier(object):
def __init__(self, random_state=None):
self.clfs = {}
for classifier, name, arg in ML_CLASSIFIERS:
for classifier, name, args in ML_CLASSIFIERS:
has_seed = 'random_state' in dir(classifier.__init__.im_class())
if has_seed:
self.clfs[name] = classifier(arg, random_state=random_state)
self.clfs[name] = classifier(random_state=random_state)
for arg_name, arg_value in args:
setattr(self.clfs[name], arg_name, arg_value)
else:
self.clfs[name] = classifier(arg)
self.clfs[name] = classifier()
for arg_name, arg_value in args:
setattr(self.clfs[name], arg_name, arg_value)

def fit(self, X, y):
threads = []
Expand Down Expand Up @@ -151,21 +152,25 @@ def predict(self, input_val):

return val

def predict_proba(self, input_val):
ret = []
for clf in self.clfs.values():
ret.append(clf.predict_proba(input_val)[0][1])
return sum(ret) / len(ret)

#-------------------------------------------------------------------------------
class CPigaiosClassifier:
def __init__(self):
self.X = []
self.y = []
self.clf = None
self.criterion = "mse"

self.dt_type = tree.DecisionTreeRegressor
self.criterion = "gini"
self.dt_type = tree.DecisionTreeClassifier

def load_data(self, dataset="dataset.csv"):
if len(self.X) > 0:
return self.X, self.y

lines = open(dataset, "rb").readlines()
x_values = []
y_values = []
with open(dataset, "r") as f:
Expand Down
5 changes: 1 addition & 4 deletions sourceimp_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,7 @@ def compare_functions(self, src_id, bin_id, heuristic):
self.ml_model = self.ml_classifier.load_model()

line = map(float, line)
ml = self.ml_model.predict(np.array(line).reshape(1, -1))
ml = float(ml)
if round(ml) == 0.0:
ml = 0
ml = self.ml_model.predict_proba(np.array(line).reshape(1, -1))

fields = COMPARE_FIELDS
cur = self.db.cursor()
Expand Down

0 comments on commit ac7010d

Please sign in to comment.