This malware detector uses the Unix `strings` command and scikit-learn's FeatureHasher to build a dataset of malware and benignware. The dataset is split into a training set and a test set, and a (not so deep) deep neural network is used to classify the files.
Thus, we now solve the task from chapter 9 with deep learning :)
# These are the imports we are going to need
import subprocess
import os
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
# Create the dataset using the files in data.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# listings are sorted to keep the sample order (and with it the seeded
# train/test split performed later) reproducible across runs and machines.
benignware = sorted(os.listdir('data/benignware'))  # <---------- CHANGE PATH HERE
malware = sorted(os.listdir('data/malware'))  # <---------------- CHANGE PATH HERE
hasher = FeatureHasher(n_features=1000)  # We initialize the featurehasher using 1,000 features
def extract_strings(filepath):
    """Extract the printable strings of a file with the Unix ``strings`` command.

    Args:
        filepath: Path of the file to scan.

    Returns:
        A list of str: the command's standard output split on newlines. Because
        the output ends with a newline, the last element is an empty string.
    """
    completed = subprocess.run(['strings', filepath], stdout=subprocess.PIPE)
    # Decode leniently: a single byte sequence that is not valid UTF-8 must not
    # abort the whole dataset build with a UnicodeDecodeError.
    output = completed.stdout.decode('utf-8', errors='replace')
    return output.split('\n')
# Run the strings extraction on every file; one list of strings per file
benign_strings = [
    extract_strings('data/benignware/' + filename) for filename in benignware
]  # All strings from benignfiles
malware_strings = [
    extract_strings('data/malware/' + filename) for filename in malware
]  # All strings from malwarefiles
# This creates the dataset by using the FeatureHasher to squash the strings of the file into a featuremap
# The code is mostly taken from the malware datascience book
def _hash_strings(strings):
    """Turn the strings of one file into a fixed-length feature vector.

    Args:
        strings: Iterable of strings extracted from a single file.

    Returns:
        A 1-D numpy array produced by the module-level ``hasher``; its length
        is the number of features the hasher was created with.
    """
    # Store the string features in dictionary form: every distinct string
    # counts once, no matter how often it occurs in the file.
    string_features = dict.fromkeys(strings, 1)
    # Hash the features using the hashing trick. The result is a sparse
    # matrix with a single row; densify it and return that row.
    return hasher.transform([string_features]).toarray()[0]


# Kept as plain Python lists (one feature vector per file) so the two lists
# can simply be concatenated when the full dataset is assembled.
benign_features = [_hash_strings(bs) for bs in benign_strings]
malware_features = [_hash_strings(ms) for ms in malware_strings]
# Assemble the labelled dataset and hold out 30% of it as a test set.
# Label convention: 1 = benignware, 0 = malware.
X = [*benign_features, *malware_features]
Y = [1] * len(benignware) + [0] * len(malware)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)
# Report how many files each class contains (there is more benignware than malware).
# The imbalance suggests that some kind of class weighting should be used
# in the algorithms later on to deal with the skewed data.
print('Statistics:')
for label, files in (('Benign-Files:', benignware), ('Malware-Files:', malware)):
    print(label, len(files))
Statistics:
Benign-Files: 991
Malware-Files: 428
Now that the dataset has been created, we can finally start training. First, we convert the splits into numpy arrays to be used by Keras. Then we define our model.
# Convert every split into a numpy array so it can be handed to keras
X_trainN, Y_trainN, X_testN, Y_testN = (
    numpy.array(split) for split in (X_train, Y_train, X_test, Y_test)
)
import keras
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
# A small fully connected binary classifier: two hidden layers of 10 ReLU
# units each, followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(10, input_dim=1000))  # input width = the 1,000 hashed features
model.add(Activation('relu'))
# Dropout expects the *fraction* of units to drop, which must lie in [0, 1).
# The former value of 3 is outside that range, so 0.3 is used instead.
model.add(Dropout(0.3))
# Only the first layer of a Sequential model needs an input size; this layer
# receives the 10 outputs of the previous Dense layer.
model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(X_trainN, Y_trainN, epochs=10, batch_size=32)

# evaluate() returns one value per entry of model.metrics_names
print('Score on unseen data:', model.evaluate(X_testN, Y_testN, batch_size=32), '(corresponding to)', *model.metrics_names)
Using TensorFlow backend.
Epoch 1/10
993/993 [==============================] - 0s 234us/step - loss: 0.6076 - acc: 0.7593
Epoch 2/10
993/993 [==============================] - 0s 62us/step - loss: 0.3291 - acc: 0.9053
Epoch 3/10
993/993 [==============================] - 0s 68us/step - loss: 0.1958 - acc: 0.9486
Epoch 4/10
993/993 [==============================] - 0s 51us/step - loss: 0.1247 - acc: 0.9688
Epoch 5/10
993/993 [==============================] - 0s 58us/step - loss: 0.0750 - acc: 0.9869
Epoch 6/10
993/993 [==============================] - 0s 54us/step - loss: 0.0456 - acc: 0.9960
Epoch 7/10
993/993 [==============================] - 0s 68us/step - loss: 0.0276 - acc: 0.9970
Epoch 8/10
993/993 [==============================] - 0s 62us/step - loss: 0.0172 - acc: 0.9970
Epoch 9/10
993/993 [==============================] - 0s 64us/step - loss: 0.0108 - acc: 0.9980
Epoch 10/10
993/993 [==============================] - 0s 63us/step - loss: 0.0077 - acc: 0.9980
426/426 [==============================] - 0s 112us/step
Score on unseen data: [0.4091785943196833, 0.9272300469483568] (corresponding to) loss acc
So even this simple network achieves an accuracy above 90% on the malware detection task :)