From 6f9a1e5d97c00cebb9111f27a543aba7693272f0 Mon Sep 17 00:00:00 2001 From: George D <georged@gmail.com> Date: Mon, 10 Nov 2014 10:59:12 +0000 Subject: [PATCH] Clean up tests --- code/malware.py | 52 +++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/code/malware.py b/code/malware.py index 87013fb..f17d4e8 100644 --- a/code/malware.py +++ b/code/malware.py @@ -339,7 +339,7 @@ def visualizedistances(data, figname=None): plt.savefig(figname, bbox_inches='tight') -def selectsubsets(data, features=200, training=400, fraction_negative = 0.5): +def selectsubsets(data, features=200, training=400, testing=100, fraction_negative = 0.5): D, L, N = data ## First identify the indexes of positives and negatives @@ -355,9 +355,11 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5): feature_set = np.hstack((negatives[:features], positives[:features])) training_set = np.hstack((negatives[features:Neg_training], positives[features:Neg_training])) - test_size = min(len(negatives[Neg_training:]), len(positives[Neg_training:])) + test_size = testing # min(len(negatives[Neg_training:]), len(positives[Pos_training:])) + test_set = np.hstack((negatives[Neg_training:Neg_training+test_size], positives[Pos_training:Pos_training+test_size])) + assert len(test_set) == 2 * test_size print "Feature size: %s Training size: %s Test size: %s" % (len(feature_set), len(training_set), len(test_set)) training_records = D[training_set, :] @@ -387,21 +389,29 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5): class TestSeq(unittest.TestCase): def setUp(self): - self.repeats = 10 - self.trees = 100 + self.repeats = 30 # 10 + self.trees = 400 # 400 self.features = 100 - self.training = 400 + self.training = 300 + self.testing = 300 self.bias = 0.5 def test_visualize(self): data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') + + D, L, N = data + assert D.shape == (2000, 2000) + assert len(L) == 2000 + assert len(L[0]) == 1 + assert len(N) == 2000 + visualizedistances(data, '../scratch/distances.png') def classifier_test(self, data, D, L, N, name): repeats = self.repeats max_ACC, TP, FP = [], [], [] for r in xrange(repeats): - feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, fraction_negative=self.bias) + feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias) (training_set, training_records, training_labels) = training_data (test_set, test_records, test_labels) = test_data @@ -426,23 +436,12 @@ def classifier_test(self, data, D, L, N, name): def test_malwareanalysis(self): data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') - D, L, N = data - assert D.shape == (2000, 2000) - assert len(L) == 2000 - assert len(L[0]) == 1 - assert len(N) == 2000 - self.classifier_test(data, D, L, N, name="MATRIX_ONLY") def test_malwareanalysis_unbalanced(self): data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') - D, L, N = data - assert D.shape == (2000, 2000) - assert len(L) == 2000 - assert len(L[0]) == 1 - assert len(N) == 2000 self.bias = 0.9 self.classifier_test(data, D, L, N, name="MATRIX_ONLY_BIASED") @@ -451,22 +450,25 @@ def test_malwareanalysis_compress(self): data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') D, L, N = data - assert D.shape == (2000, 2000) - assert len(L) == 2000 assert len(L[0]) == 2 - assert len(N) == 2000 - + self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS") - def test_compress_only(self): + def test_malwareanalysis_compress_unbalanced(self): data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') D, L, N = data - assert D.shape == (2000, 2000) - assert len(L) == 2000 assert len(L[0]) == 2 - assert len(N) == 2000 + + self.bias = 0.9 + self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS") + + def test_compress_only(self): + data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') + + D, L, N = data + self.features = 0 self.classifier_test(data, D, L, N, name="COMPRESS_ONLY")