From 6f9a1e5d97c00cebb9111f27a543aba7693272f0 Mon Sep 17 00:00:00 2001
From: George D <georged@gmail.com>
Date: Mon, 10 Nov 2014 10:59:12 +0000
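Subject: [PATCH] Clean up tests

Add a `testing` argument to selectsubsets() so callers choose the test-set
size, and assert that the resulting test set has 2 * testing entries.
Move the 2000x2000 dataset shape assertions into test_visualize, retune the
TestSeq parameters (repeats, trees, training, testing), and add
test_malwareanalysis_compress_unbalanced.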
---
 code/malware.py | 52 +++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/code/malware.py b/code/malware.py
index 87013fb..f17d4e8 100644
--- a/code/malware.py
+++ b/code/malware.py
@@ -339,7 +339,7 @@ def visualizedistances(data, figname=None):
     plt.savefig(figname, bbox_inches='tight')
 
 
-def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
+def selectsubsets(data, features=200, training=400, testing=100, fraction_negative = 0.5):
     D, L, N = data
 
     ## First identify the indexes of positives and negatives
@@ -355,9 +355,11 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
     feature_set = np.hstack((negatives[:features], positives[:features]))
     training_set = np.hstack((negatives[features:Neg_training], positives[features:Neg_training]))
 
-    test_size = min(len(negatives[Neg_training:]), len(positives[Neg_training:]))
+    test_size = testing # min(len(negatives[Neg_training:]), len(positives[Pos_training:]))
+    
     test_set = np.hstack((negatives[Neg_training:Neg_training+test_size], positives[Pos_training:Pos_training+test_size]))
 
+    assert len(test_set) == 2 * test_size
     print "Feature size: %s Training size: %s Test size: %s" % (len(feature_set), len(training_set), len(test_set))
 
     training_records = D[training_set, :]
@@ -387,21 +389,29 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
 class TestSeq(unittest.TestCase):
 
     def setUp(self):
-        self.repeats = 10
-        self.trees = 100
+        self.repeats = 30  # loop count used by classifier_test (previously 10)
+        self.trees = 400  # previously 100; where it is consumed is not visible in this patch
         self.features = 100
-        self.training = 400
+        self.training = 300  # forwarded to selectsubsets(training=...)
+        self.testing = 300  # forwarded to selectsubsets(testing=...)
         self.bias = 0.5
 
     def test_visualize(self):
         data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')
+
+        D, L, N = data
+        assert D.shape == (2000, 2000)
+        assert len(L) == 2000
+        assert len(L[0]) == 1
+        assert len(N) == 2000
+
         visualizedistances(data, '../scratch/distances.png')
 
     def classifier_test(self, data, D, L, N, name):
         repeats = self.repeats
         max_ACC, TP, FP = [], [], []
         for r in xrange(repeats):
-            feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, fraction_negative=self.bias)
+            feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias)
             (training_set, training_records, training_labels) = training_data
             (test_set, test_records, test_labels) = test_data
 
@@ -426,23 +436,12 @@ def classifier_test(self, data, D, L, N, name):
 
     def test_malwareanalysis(self):
         data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')
-
         D, L, N = data
-        assert D.shape == (2000, 2000)
-        assert len(L) == 2000
-        assert len(L[0]) == 1
-        assert len(N) == 2000
-
         self.classifier_test(data, D, L, N, name="MATRIX_ONLY")
 
     def test_malwareanalysis_unbalanced(self):
         data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')
-
         D, L, N = data
-        assert D.shape == (2000, 2000)
-        assert len(L) == 2000
-        assert len(L[0]) == 1
-        assert len(N) == 2000
 
         self.bias = 0.9
         self.classifier_test(data, D, L, N, name="MATRIX_ONLY_BIASED")
@@ -451,22 +450,25 @@ def test_malwareanalysis_compress(self):
         data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')
 
         D, L, N = data
-        assert D.shape == (2000, 2000)
-        assert len(L) == 2000
         assert len(L[0]) == 2
-        assert len(N) == 2000
-
+        
         self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS")
 
-    def test_compress_only(self):
+    def test_malwareanalysis_compress_unbalanced(self):
         data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')
 
         D, L, N = data
-        assert D.shape == (2000, 2000)
-        assert len(L) == 2000
         assert len(L[0]) == 2
-        assert len(N) == 2000
+        # Run under its own name; "MATRIX_COMPRESS" is already used by the balanced test.
+        self.bias = 0.9  # forwarded to selectsubsets as fraction_negative
+        self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS_BIASED")
 
+
+    def test_compress_only(self):
+        data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')
+
+        D, L, N = data
+        
         self.features = 0
         self.classifier_test(data, D, L, N, name="COMPRESS_ONLY")