-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathget_dates.py
executable file
·129 lines (118 loc) · 5.46 KB
/
get_dates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/python3
"""
This script reads the modification time of classes.dex (or, when that is
missing, resources.arsc) for each unpacked app in the malicious_apk and
benign_apk folders and copies those dates into a JSON file for later analysis.
The output data format is as follows:
{"features": ["1222000000_to_1222111111", ...],
"apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1], "malicious": [0,1]}, ...}}
"""
import os
import json
import glob
import time
import random
from configparser import ConfigParser
__author__='mkkeffeler'
_ANDROID_RELEASE = 1222000000  # Android was released Sept. 23, 2008
_BUCKET_SECONDS = 60 * 60 * 24 * 28  # width of one date bucket (28 days)
# Position in this tuple doubles as the "is malicious" flag (0 = benign, 1 = malicious).
_DATASET_DIRS = ('benign_apk', 'malicious_apk')


def _scan_apps(not_found_log, now):
    """Collect the modification date and label of every unpacked app.

    For each ``<name>.apk`` in the dataset folders, the sibling directory
    ``<name>`` is inspected: the mtime of ``classes.dex`` is used when that
    file exists, otherwise the mtime of ``resources.arsc``.

    Args:
        not_found_log: writable text file; receives one apk filename per line
            for every app whose dated file could not be found.
        now: current time in seconds since the epoch, used to count apps
            dated in the future.

    Returns:
        A tuple ``(app_dates, app_labels, num_before, num_after, num_missing)``
        where ``app_dates`` maps apk filename to an integer mtime and
        ``app_labels`` maps apk filename to ``[1, 0]`` (malicious) or
        ``[0, 1]`` (benign).

    Raises:
        FileNotFoundError: if one of the dataset folders does not exist.
    """
    app_dates = {}
    app_labels = {}
    num_before = 0
    num_after = 0
    num_missing = 0
    for is_malicious, directory in enumerate(_DATASET_DIRS):
        # A missing dataset folder is a setup error, not an empty dataset.
        if not os.path.isdir(directory):
            raise FileNotFoundError('dataset directory not found: ' + directory)
        for apk_path in glob.glob(os.path.join(directory, '*.apk')):
            filename = os.path.basename(apk_path)
            unpacked_dir = apk_path[:-4]  # strip the ".apk" suffix
            dated_file = os.path.join(unpacked_dir, 'classes.dex')
            if not os.path.exists(dated_file):
                dated_file = os.path.join(unpacked_dir, 'resources.arsc')
            try:
                mtime = os.stat(dated_file).st_mtime
            except FileNotFoundError:
                num_missing += 1
                not_found_log.write(filename + '\n')
                continue
            if mtime < _ANDROID_RELEASE:
                num_before += 1
            if mtime > now:
                num_after += 1
            app_dates[filename] = int(mtime)
            # One-hot vector of length 2: 1st bit set if malicious, otherwise 2nd bit.
            app_labels[filename] = [1, 0] if is_malicious else [0, 1]
    return app_dates, app_labels, num_before, num_after, num_missing


def _build_date_buckets(now):
    """Return the names of contiguous 28-day buckets from Android's release up to ``now``.

    Each name has the form ``"<start>_to_<end>"``; the last bucket is the
    first one whose end reaches ``now`` (apps can't have been made in the
    future).
    """
    date_buckets = []
    start = _ANDROID_RELEASE
    while True:
        end = start + _BUCKET_SECONDS
        date_buckets.append(str(start) + "_to_" + str(end))
        if end >= now:
            break
        start = end
    return date_buckets


def _bucket_for(mtime, date_buckets):
    """Return the bucket name whose half-open range ``[start, end)`` holds ``mtime``, or ``None``."""
    if mtime < _ANDROID_RELEASE:
        return None
    # Buckets are contiguous and equally wide, so the index is a plain division.
    index = (mtime - _ANDROID_RELEASE) // _BUCKET_SECONDS
    if index >= len(date_buckets):
        return None
    return date_buckets[index]


def main():
    """Build balanced per-date-bucket feature vectors for the app dataset.

    Reads ``NUM_DATE_BUCKETS`` from section ``AMA`` of ``config.ini``, scans
    the ``benign_apk`` and ``malicious_apk`` folders in the current working
    directory, and writes:

    * ``file_not_found_error`` - apk names whose dated file was missing,
    * ``apps_per_bucket.json`` - ``[malicious, benign]`` counts per bucket,
    * ``app_date_vectors.json`` - ``{"features": [...], "apps": {...}}``.

    The buckets kept are the ``NUM_DATE_BUCKETS`` buckets with the largest
    ``min(malicious, benign)`` count, and at most that smallest count of each
    class is taken from every kept bucket. A summary is printed to stdout.
    """
    config = ConfigParser()
    config.read('config.ini')
    num_date_buckets = config.getint('AMA', 'NUM_DATE_BUCKETS')
    now = time.time()

    # The context manager guarantees the log is closed even if the scan fails.
    with open('file_not_found_error', 'w') as not_found_log:
        (app_date_map, app_malicious_map,
         num_apps_before, num_apps_after, num_file_not_found) = _scan_apps(not_found_log, now)

    date_buckets = _build_date_buckets(now)

    # Count the number of apps per date range so we can ensure there's an
    # equal number in each.  The counters are fresh lists: they must never
    # share storage with an app's label vector, or incrementing a counter
    # would overwrite that app's label.
    apps_per_bucket = {}  # bucket name -> [malicious count, benign count]
    bucket_of_app = {}    # apk filename -> bucket name (only apps inside some bucket)
    for app_name, mtime in app_date_map.items():
        bucket = _bucket_for(mtime, date_buckets)
        if bucket is None:
            continue
        bucket_of_app[app_name] = bucket
        counts = apps_per_bucket.setdefault(bucket, [0, 0])
        counts[0 if app_malicious_map[app_name][0] == 1 else 1] += 1

    with open('apps_per_bucket.json', 'w') as f:
        json.dump(apps_per_bucket, f)

    relevant_buckets = sorted(
        apps_per_bucket,
        key=lambda bucket: min(apps_per_bucket[bucket]),
        reverse=True,
    )
    if len(relevant_buckets) > num_date_buckets:
        relevant_buckets = relevant_buckets[:num_date_buckets]

    # Number of apps of each type (benign/malicious) taken from each bucket.
    # With no usable bucket there is nothing to take.
    if relevant_buckets:
        apps_per_bucket_limit = min(apps_per_bucket[relevant_buckets[-1]])
    else:
        apps_per_bucket_limit = 0

    # Now add apps_per_bucket_limit apps of each class from each bucket to all_apps.
    relevant_index = {bucket: pos for pos, bucket in enumerate(relevant_buckets)}
    apps_found_per_bucket = {bucket: [0, 0] for bucket in relevant_buckets}
    all_apps = {}
    for app_name in app_date_map:
        this_bucket = bucket_of_app.get(app_name)
        if this_bucket not in relevant_index:
            continue
        label = app_malicious_map[app_name]
        slot = 0 if label == [1, 0] else 1
        if apps_found_per_bucket[this_bucket][slot] >= apps_per_bucket_limit:
            continue
        apps_found_per_bucket[this_bucket][slot] += 1
        date_vector = [0] * len(relevant_buckets)
        date_vector[relevant_index[this_bucket]] = 1
        all_apps[app_name] = {'vector': date_vector, 'malicious': label}

    with open('app_date_vectors.json', 'w') as outfile:
        json.dump({'features': relevant_buckets, 'apps': all_apps}, outfile)

    print('Wrote data on ' + str(len(relevant_buckets)) + ' date buckets and ' + str(len(all_apps)) + ' apps to a file.')
    print('{} apps were before Android began and {} were after today'.format(num_apps_before, num_apps_after))
    print('{} classes.dex files were not found'.format(num_file_not_found))
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()