Skip to content

Commit

Permalink
Clean up tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
gdanezis committed Nov 10, 2014
1 parent bd6a7f7 commit 6f9a1e5
Showing 1 changed file with 27 additions and 25 deletions.
52 changes: 27 additions & 25 deletions code/malware.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -339,7 +339,7 @@ def visualizedistances(data, figname=None):
plt.savefig(figname, bbox_inches='tight')


def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
def selectsubsets(data, features=200, training=400, testing=100, fraction_negative = 0.5):
D, L, N = data

## First identify the indexes of positives and negatives
Expand All @@ -355,9 +355,11 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
feature_set = np.hstack((negatives[:features], positives[:features]))
training_set = np.hstack((negatives[features:Neg_training], positives[features:Neg_training]))

test_size = min(len(negatives[Neg_training:]), len(positives[Neg_training:]))
test_size = testing # min(len(negatives[Neg_training:]), len(positives[Pos_training:]))

test_set = np.hstack((negatives[Neg_training:Neg_training+test_size], positives[Pos_training:Pos_training+test_size]))

assert len(test_set) == 2 * test_size
print "Feature size: %s Training size: %s Test size: %s" % (len(feature_set), len(training_set), len(test_set))

training_records = D[training_set, :]
Expand Down Expand Up @@ -387,21 +389,29 @@ def selectsubsets(data, features=200, training=400, fraction_negative = 0.5):
class TestSeq(unittest.TestCase):

def setUp(self):
self.repeats = 10
self.trees = 100
self.repeats = 30 # 10
self.trees = 400 # 400
self.features = 100
self.training = 400
self.training = 300
self.testing = 300
self.bias = 0.5

def test_visualize(self):
data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')

D, L, N = data
assert D.shape == (2000, 2000)
assert len(L) == 2000
assert len(L[0]) == 1
assert len(N) == 2000

visualizedistances(data, '../scratch/distances.png')

def classifier_test(self, data, D, L, N, name):
repeats = self.repeats
max_ACC, TP, FP = [], [], []
for r in xrange(repeats):
feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, fraction_negative=self.bias)
feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias)
(training_set, training_records, training_labels) = training_data
(test_set, test_records, test_labels) = test_data

Expand All @@ -426,23 +436,12 @@ def classifier_test(self, data, D, L, N, name):

def test_malwareanalysis(self):
data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')

D, L, N = data
assert D.shape == (2000, 2000)
assert len(L) == 2000
assert len(L[0]) == 1
assert len(N) == 2000

self.classifier_test(data, D, L, N, name="MATRIX_ONLY")

def test_malwareanalysis_unbalanced(self):
data = read_data('../data/filelabels.csv', '../data/ncdvals.csv')

D, L, N = data
assert D.shape == (2000, 2000)
assert len(L) == 2000
assert len(L[0]) == 1
assert len(N) == 2000

self.bias = 0.9
self.classifier_test(data, D, L, N, name="MATRIX_ONLY_BIASED")
Expand All @@ -451,22 +450,25 @@ def test_malwareanalysis_compress(self):
data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')

D, L, N = data
assert D.shape == (2000, 2000)
assert len(L) == 2000
assert len(L[0]) == 2
assert len(N) == 2000


self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS")

def test_compress_only(self):
def test_malwareanalysis_compress_unbalanced(self):
data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')

D, L, N = data
assert D.shape == (2000, 2000)
assert len(L) == 2000
assert len(L[0]) == 2
assert len(N) == 2000

self.bias = 0.9
self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS")


def test_compress_only(self):
data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv')

D, L, N = data

self.features = 0
self.classifier_test(data, D, L, N, name="COMPRESS_ONLY")

Expand Down

0 comments on commit 6f9a1e5

Please sign in to comment.