forked from vl2g/floco
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsyntax_match.py
85 lines (71 loc) · 2.67 KB
/
syntax_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from pathlib import Path
from tree_sitter import Language, Parser
from parser import (
DFG_python,
DFG_java,
DFG_ruby,
DFG_go,
DFG_php,
DFG_javascript,
DFG_csharp,
remove_comments_and_docstrings,
tree_to_token_index,
index_to_code_token,
tree_to_variable_index
)
# Maps a language name to the matching DFG_* callable imported from the
# project-local `parser` module. The functions visible in this file never read
# this table.
dfg_function = {
'python': DFG_python,
'java': DFG_java,
'ruby': DFG_ruby,
'go': DFG_go,
'php': DFG_php,
'javascript': DFG_javascript,
'c_sharp': DFG_csharp,
}
# Path(__file__).parents[2] is the directory two levels above the directory
# that contains this file; the parser location below is resolved against it.
root_directory = Path(__file__).parents[2]
# Location of the shared library handed to `Language(...)` in
# corpus_syntax_match. Presumably a compiled tree-sitter grammar bundle that
# covers the languages above -- TODO confirm against the build script.
PARSER_LOCATION = root_directory.joinpath("files_to_be_submitted/code_implementations/parser/my-languages.so")
def calc_syntax_match(references, candidate, lang):
    """Score one candidate against its references.

    The single sample is wrapped into a one-element corpus and handed to
    ``corpus_syntax_match``; the result of that call is returned unchanged.

    Args:
        references: The references for ``candidate``; forwarded as the only
            entry of the corpus-level reference list.
        candidate: The candidate to score; forwarded as the only entry of the
            corpus-level candidate list.
        lang: Language name, forwarded as-is.

    Returns:
        Whatever ``corpus_syntax_match`` returns for the one-sample corpus.
    """
    single_sample_references = [references]
    single_sample_candidates = [candidate]
    return corpus_syntax_match(
        single_sample_references,
        single_sample_candidates,
        lang,
    )
def corpus_syntax_match(references, candidates, lang):
    """Compute the corpus-level syntax-match score.

    For every sample, the candidate and each of its references are stripped of
    comments (best effort), parsed with the grammar selected by ``lang``, and
    broken down into sub-trees. A reference sub-tree counts as matched when its
    S-expression also occurs among the candidate's sub-trees. The score is the
    number of matched reference sub-trees divided by the total number of
    reference sub-trees over the whole corpus.

    Args:
        references: Sequence indexed in step with ``candidates``;
            ``references[i]`` is an iterable of reference source strings for
            ``candidates[i]``.
        candidates: Sequence of candidate source strings.
        lang: Grammar name passed to ``Language(PARSER_LOCATION, lang)`` and
            to ``remove_comments_and_docstrings``.

    Returns:
        ``match_count / total_count`` as a float, or ``0.0`` when there is
        nothing to compare (no candidates, or no reference sub-trees at all).

    Raises:
        IndexError: If ``references`` has fewer entries than ``candidates``.
    """
    # Nothing to score: skip loading the grammar and avoid dividing by zero.
    if len(candidates) == 0:
        return 0.0

    language = Language(PARSER_LOCATION, lang)
    parser = Parser()
    parser.set_language(language)

    match_count = 0
    total_count = 0
    for i, candidate in enumerate(candidates):
        references_sample = references[i]

        # The candidate does not depend on the reference, so it is cleaned and
        # parsed once per sample instead of once per reference. This also
        # guarantees every reference is compared against the same candidate
        # text rather than a repeatedly re-cleaned one.
        candidate = _strip_comments_best_effort(candidate, lang)
        candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
        # Only membership is tested below, so a set replaces the linear scan
        # over a list for every reference sub-tree.
        candidate_sexps = {
            sexp for sexp, _ in _get_all_sub_trees(candidate_tree)
        }

        for reference in references_sample:
            reference = _strip_comments_best_effort(reference, lang)
            reference_tree = parser.parse(bytes(reference, 'utf8')).root_node
            reference_sub_trees = _get_all_sub_trees(reference_tree)
            match_count += sum(
                1 for sexp, _ in reference_sub_trees if sexp in candidate_sexps
            )
            total_count += len(reference_sub_trees)

    # Every sample had an empty reference list: report 0.0 instead of raising
    # ZeroDivisionError.
    if total_count == 0:
        return 0.0
    return match_count / total_count


def _strip_comments_best_effort(code, lang):
    """Return ``code`` without comments, or ``code`` unchanged if that fails."""
    # Use the language actually being scored; a fixed 'java' applies the wrong
    # comment rules to every other language.
    try:
        return remove_comments_and_docstrings(code, lang)
    except Exception:
        # Deliberate best effort: unparsable input is scored as-is. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return code


def _get_all_sub_trees(root_node):
    """Collect ``(sexp, depth)`` for the root and every descendant with children.

    The tree is walked with an explicit stack. The root is always recorded at
    depth 1; below it only nodes that themselves have children are visited, so
    leaf nodes never contribute an entry of their own.
    """
    sub_trees = []
    node_stack = [(root_node, 1)]
    while node_stack:
        cur_node, cur_depth = node_stack.pop()
        sub_trees.append((cur_node.sexp(), cur_depth))
        for child_node in cur_node.children:
            if child_node.children:
                node_stack.append((child_node, cur_depth + 1))
    return sub_trees