forked from ellisk42/ec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
max_compressor_test.py
136 lines (105 loc) · 3.71 KB
/
max_compressor_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Max: scratch test of the compressor, to get the stuff I want out of it.
from program import Primitive, Program
from grammar import Grammar
from type import tlist, tint, tbool, arrow, t0, t1, t2, tpregex
from string import printable
#from pregex import pregex
import math
# Evaluation to regular regex form; then I can unflatten using Luke's stuff.
def _kleene(x):
    """Return ``pregex.KleeneStar`` applied to ``x`` with ``p=0.25``."""
    # NOTE(review): the ``pregex`` import near the top of this file is
    # commented out; unless a star import supplies the name, calling this
    # raises NameError -- confirm before relying on it.
    starred = pregex.KleeneStar(x, p=0.25)
    return starred
def _plus(x):
    """Return ``pregex.Plus`` applied to ``x`` with ``p=0.25``."""
    repeated = pregex.Plus(x, p=0.25)
    return repeated
def _maybe(x):
    """Return ``pregex.Maybe`` applied to ``x``."""
    optional = pregex.Maybe(x)
    return optional
def _alt(x):
    """Curried alternation: ``_alt(x)(y)`` returns ``pregex.Alt([x, y])``."""
    # Author's open question: maybe the operand order should be reversed.
    # String-form equivalent: "(" + x + "|" + y + ")"
    def with_right(y):
        return pregex.Alt([x, y])
    return with_right
def _concat(x):
    """Curried concatenation: ``_concat(x)(y)`` returns ``pregex.Concat([x, y])``."""
    # String-form equivalent: "(" + x + y + ")"
    def with_right(y):
        return pregex.Concat([x, y])
    return with_right
#def _wrapper(x): return lambda y: y
#specials = [".","*","+","?","|"]
# Punctuation and whitespace characters, each paired with the spelled-out
# label that replaces the raw character when primitive names are built.
# NOTE(review): ")" is labelled "left_paren" and "(" is labelled
# "right_paren", and "astrisk" / "carrot" are misspelled. These labels are
# runtime strings that end up inside primitive names, so they are reproduced
# unchanged here; confirm nothing depends on them before renaming.
disallowed = [
    ("#", "hash"),
    ("!", "bang"),
    ("\"", "double_quote"),
    ("$", "dollar"),
    ("%", "percent"),
    ("&", "ampersand"),
    ("'", "single_quote"),
    (")", "left_paren"),
    ("(", "right_paren"),
    ("*", "astrisk"),
    ("+", "plus"),
    (",", "comma"),
    ("-", "dash"),
    (".", "period"),
    ("/", "slash"),
    (":", "colon"),
    (";", "semicolon"),
    ("<", "less_than"),
    ("=", "equal"),
    (">", "greater_than"),
    ("?", "question_mark"),
    ("@", "at"),
    ("[", "left_bracket"),
    ("\\", "backslash"),
    ("]", "right_bracket"),
    ("^", "carrot"),
    ("_", "underscore"),
    ("`", "backtick"),
    ("|", "bar"),
    ("}", "right_brace"),
    ("{", "left_brace"),
    ("~", "tilde"),
    (" ", "space"),
    ("\t", "tab"),
]
# Only the characters, in table order.
disallowed_list = [pair[0] for pair in disallowed]
def altPrimitives():
    """Build the list of regex primitives used by this compressor test.

    Every primitive is created with ``None`` as its value. In order, the
    list holds: ``empty_string``; one ``string_<char>`` for each character of
    ``printable[:-4]`` that is not in ``disallowed_list``; one
    ``string_<label>`` for each entry of ``disallowed``; the six ``r_dot``,
    ``r_d``, ``r_s``, ``r_w``, ``r_l``, ``r_u`` primitives of type
    ``tpregex``; and finally the operators ``r_kleene``, ``r_maybe``,
    ``r_alt`` and ``r_concat``.

    Returns:
        A list of ``Primitive`` objects in the order described above.
    """
    # Names of every primitive whose type is plain ``tpregex``, in output order.
    terminal_names = ["empty_string"]
    for ch in printable[:-4]:
        if ch not in disallowed_list:
            terminal_names.append("string_" + ch)
    for _, label in disallowed:
        terminal_names.append("string_" + label)
    terminal_names.extend(["r_dot", "r_d", "r_s", "r_w", "r_l", "r_u"])

    # (name, number of tpregex arguments). "r_plus" remains disabled.
    operator_arities = [
        ("r_kleene", 1),
        ("r_maybe", 1),
        ("r_alt", 2),
        ("r_concat", 2),
    ]

    primitives = [Primitive(name, tpregex, None) for name in terminal_names]
    for name, arity in operator_arities:
        # arity arguments plus the result, all of type tpregex.
        signature = arrow(*([tpregex] * (arity + 1)))
        primitives.append(Primitive(name, signature, None))
    return primitives
from grammar import *
prim_list = altPrimitives()
# Names of the operator primitives that receive the fixed 0.15 weight below.
# "r_plus" is listed even though altPrimitives() currently has it commented
# out, so only four of these names actually occur in prim_list.
specials = ["r_kleene", "r_plus", "r_maybe", "r_alt", "r_concat"]
# Count the base (non-operator) primitives directly. The previous
# `len(prim_list) - 5.` assumed all five specials were present and therefore
# undercounted by one. Kept as a float, as before.
n_base_prim = float(sum(1 for prim in prim_list if prim.name not in specials))
# Base primitives split a total weight of 0.25 evenly; each operator gets 0.15.
productions = [
    (0.15, prim) if prim.name in specials else (0.25 / n_base_prim, prim)
    for prim in prim_list
]
baseGrammar = Grammar.fromProductions(productions)
#for testing stuff
from program import *
from frontier import Frontier
from fragmentGrammar import *
# Four toy programs of the shape "X followed by X*", one per primitive X.
toy_program_sources = [
    "(r_concat r_dot (r_kleene r_dot))",
    "(r_concat r_d (r_kleene r_d))",
    "(r_concat r_u (r_kleene r_u))",
    "(r_concat r_w (r_kleene r_w))",
]
frontiers = []
parsed_programs = []
for source_text in toy_program_sources:
    # Parse, then immediately wrap in a dummy frontier (same call order as
    # writing the four stanzas out by hand).
    parsed_program = Program.parse(source_text)
    parsed_programs.append(parsed_program)
    frontier = Frontier.dummy(parsed_program, logLikelihood=0., logPrior=0.)
    frontiers.append(frontier)
# Keep the individual program names bound at module level.
program1, program2, program3, program4 = parsed_programs
# Run grammar induction on the base grammar and the toy frontiers; `grammar`
# is bound and `frontiers` is rebound to the returned pair.
grammar, frontiers = induceGrammar(
    baseGrammar,
    frontiers,
    topK=5,
    topk_use_map=False,
    pseudoCounts=1.0,
    a=3,
    aic=0.0,
    structurePenalty=.1,
    backend='rust',
    CPUs=1,
    iteration=1,
)