forked from Yunlongs/Goshawk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilarity_inference.py
127 lines (104 loc) · 4.36 KB
/
similarity_inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import tensorflow as tf
from normalize import normalize_on_file
from embedding import extract_embedding_per_file, load_embedding, create_link_func_string, get_reference_embedding, \
load_embedding_to_list
import os
import config
import numpy as np
import time
from data_proccess.get_label_data import convert_origin_to_prototype, convert_prototype_to_json
def norm(vector):
    """
    Scale a vector to unit L2 length.

    The L2 norm is taken over every element of ``vector`` (square, sum,
    square root) and the input is divided by it.  When the norm is exactly
    zero the division is skipped and zeros are returned instead, which avoids
    the 0/0 division that would otherwise fill the result with NaN and emit a
    NumPy RuntimeWarning.

    :param vector: array-like of numbers.
    :return: numpy array with the same shape as ``vector``.
    """
    values = np.asarray(vector)
    length = np.sqrt(np.sum(np.square(values)))
    if length == 0:
        # Keep the dtype a real division would have produced (e.g. float64
        # for integer input, float32 for float32 input).
        return np.zeros_like(values, dtype=np.result_type(values, length))
    return values / length
def extract_embedding(model, input_path, out_path):
    """
    Extract embeddings for a single file.

    Thin wrapper that forwards its three arguments, unchanged and in the same
    order, to ``extract_embedding_per_file``.  Whatever that call returns is
    discarded, so this function returns None.

    :param model: model object forwarded as the first argument.
    :param input_path: input path forwarded as the second argument.
    :param out_path: output path forwarded as the third argument.
    :return: None
    """
    extract_embedding_per_file(model, input_path, out_path)
def calculate_similarity(target_embedding_file, link_func_string, link_func_name, model_name):
    """
    Score every target embedding against the reference embedding and save the scores.

    Each embedding loaded from ``target_embedding_file`` is L2-normalized with
    ``norm`` and dotted with the reference embedding of ``model_name`` (reshaped
    to ``(config.embedding_size,)``).  Two files are written:

    * ``config.func_sim_path``: one ``"<function string> <score>"`` line per
      distinct function string, highest score first, followed by a dashed
      separator line.
    * ``config.func_name_sim_path``: one ``"<function name> <score>"`` line per
      distinct function name, in insertion order.

    :param target_embedding_file: path handed to ``load_embedding_to_list``.
    :param link_func_string: sequence indexed like the embeddings, giving the function string.
    :param link_func_name: sequence indexed like the embeddings, giving the function name.
    :param model_name: name handed to ``get_reference_embedding``.
    :return: dict mapping function name to its similarity score.
    """
    embeddings = load_embedding_to_list(target_embedding_file)
    reference = get_reference_embedding(model_name).reshape((config.embedding_size,))

    score_by_string = {}
    score_by_name = {}
    for position, raw_vector in enumerate(embeddings):
        score = np.dot(norm(raw_vector), reference)
        # A repeated key keeps the score of its last occurrence.
        score_by_string[link_func_string[position]] = score
        score_by_name[link_func_name[position]] = score

    ranked = sorted(score_by_string.items(), key=lambda pair: pair[1], reverse=True)
    report_lines = [" ".join([text, str(score)]) + "\n" for text, score in ranked]
    report_lines.append("-----------------\n")

    with open(config.func_sim_path, "w") as report:
        report.write("".join(report_lines))

    with open(config.func_name_sim_path, "w") as name_report:
        name_report.writelines(
            " ".join([name, str(score)]) + "\n" for name, score in score_by_name.items()
        )

    return score_by_name
def calculate_similarity_no_sort(test_embedding_file, type):
    """
    Return one similarity score per loaded embedding, in load order.

    Every embedding from ``test_embedding_file`` is L2-normalized with ``norm``
    and dotted with the reference embedding obtained for ``type``.  Nothing is
    sorted and nothing is written to disk.

    :param test_embedding_file: path handed to ``load_embedding_to_list``.
    :param type: value handed to ``get_reference_embedding``.
    :return: list of dot-product results, one per embedding.
    """
    embeddings = load_embedding_to_list(test_embedding_file)
    reference = get_reference_embedding(type)
    # NOTE(review): calculate_similarity reshapes the reference embedding to
    # (config.embedding_size,) before the dot product; this function does not.
    # Confirm the difference is intentional.
    return [np.dot(norm(raw_vector), reference) for raw_vector in embeddings]
def working_on_json_function_prototype(model, json_file, model_name):
    """
    Run normalization, embedding extraction and similarity scoring on a JSON prototype file.

    Steps, in order:

    1. ``normalize_on_file`` processes ``json_file`` (with the fixed ``"alloc"``
       argument) into ``temp/func_seg``.
    2. ``extract_embedding`` turns ``temp/func_seg`` into ``temp/embedding``;
       the wall-clock duration of this step is printed.
    3. ``create_link_func_string`` builds the per-index function strings and names.
    4. ``calculate_similarity`` scores the embeddings and writes its result files.

    :param model: trained, instantiated model used for embedding extraction.
    :param json_file: input JSON file of function prototypes.
    :param model_name: name used to look up the reference embedding.
    :return: dict mapping function name to similarity, as returned by ``calculate_similarity``.
    """
    segment_path = "temp/func_seg"
    embedding_path = "temp/embedding"

    normalize_on_file(json_file, "alloc", segment_path)

    started = time.time()
    extract_embedding(model, segment_path, embedding_path)
    print("extracting time:", time.time() - started)

    func_strings, func_names = create_link_func_string(json_file, segment_path)
    return calculate_similarity(embedding_path, func_strings, func_names, model_name)
def working_on_raw_function_prototype(model_name, filename):
    """
    Run the whole similarity pipeline on a file of raw function prototypes.

    The model named ``model_name`` is loaded with ``get_model``.  ``filename``
    is then passed to ``convert_origin_to_prototype`` and
    ``convert_prototype_to_json`` (in that order) and finally to
    ``working_on_json_function_prototype``.  The similarity mapping produced by
    that last call is discarded.

    :param model_name: name of the model to load and of the reference embedding to use.
    :param filename: path of the raw function prototype file.
    :return: None
    """
    loaded_model = get_model(model_name)
    for convert in (convert_origin_to_prototype, convert_prototype_to_json):
        convert(filename)
    working_on_json_function_prototype(loaded_model, filename, model_name)
def list_model():
    """
    Print the entries of the ``model`` directory as a comma-separated list.

    The directory is looked up relative to the current working directory.
    """
    print("Current available model:")
    print(", ".join(os.listdir("model")))
def get_model(model_name):
    """
    Load the saved Keras model stored at ``<config.model_dir>/<model_name>/maxauc_model``.

    The available model names are printed first via ``list_model``.  An unknown
    ``model_name`` is rejected before any loading is attempted.

    :param model_name: name of a sub-directory of the ``model`` directory.
    :return: the object returned by ``tf.keras.models.load_model``.
    :raises AssertionError: if ``model_name`` is not an entry of the ``model`` directory.
    """
    list_model()
    # NOTE(review): availability is checked against the relative "model"
    # directory while loading uses config.model_dir -- confirm both always
    # refer to the same location.
    if model_name not in os.listdir("model"):
        # Raised explicitly instead of through an ``assert`` statement so the
        # check is not stripped when Python runs with -O.  The exception type
        # and message are unchanged for existing callers.
        raise AssertionError("model name %s not in directory \"model\""%(model_name))
    model_path = os.sep.join([config.model_dir, model_name, "maxauc_model"])
    return tf.keras.models.load_model(model_path)
def similarity_inference(model_name, input_file):
    """
    Copy ``input_file`` into ``config.temp_dir`` and run the raw-prototype pipeline on the copy.

    The copy keeps the base name of ``input_file``; the pipeline is then started
    with ``working_on_raw_function_prototype`` on the copied path rather than
    on ``input_file`` itself.

    :param model_name: name of the model to use.
    :param input_file: path of the raw function prototype file.
    :return: None
    """
    import shutil

    staged_copy = os.sep.join([config.temp_dir, os.path.basename(input_file)])
    shutil.copy(input_file, staged_copy)
    working_on_raw_function_prototype(model_name, staged_copy)
if __name__ == "__main__":
    # Script entry point: run the raw-prototype pipeline with the "alloc" model
    # on "subword_dataset/test.func".  The path is passed directly, i.e. without
    # the copy into config.temp_dir that similarity_inference performs.
    working_on_raw_function_prototype("alloc","subword_dataset/test.func")