-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCompiler.java
460 lines (378 loc) · 9.6 KB
/
Compiler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
/**
* This class implements a regular expression compiler using the
* following rules:
*
* E -> D
* E -> DE
* D -> T
* D -> T|D
* T -> F
* T -> F*
* T -> F+
* T -> F?
* F -> \v
* F -> v
* F -> .
* F -> []L]
* F -> [L]
* F -> (E)
* L -> v
* L -> vL
*
* Where E is an expression, T is a term, F is a factor, L is a list
* and v is a vocab item (a literal).
*
* This program will accept a regex from the command line, and then
* output the compiled FSM to stdout.
*
* Dan Collins 1183446
* Severin Mahoney Marsh 1181754
*/
import java.util.ArrayList;
public class Compiler {
class Node {
String literal;
int next1;
int next2;
public Node(String lit, int n1, int n2) {
literal = lit;
next1 = n1;
next2 = n2;
}
public void setNext1(int n1) {
next1 = n1;
}
public void setNext2(int n2) {
next2 = n2;
}
public int getNext1() {
return next1;
}
public int getNext2() {
return next2;
}
public String toString() {
return String.format("%s\n%d\n%d", literal, next1, next2);
//return String.format("%s,%d,%d", literal, next1, next2);
}
}
private String exp;
private int index;
private int state;
private ArrayList<Node> fsm;
public static char[] SPECIAL_CHARS = {'\\', '*', '+', '?', '|',
'.', '(', ')', '[', ']'};
public Compiler() {
fsm = new ArrayList<Node>();
}
public void setExpression(String exp) {
this.exp = exp;
}
public void compile() throws IllegalArgumentException {
int initial;
index = 0;
state = 1;
try {
// Create a placeholder state for the initial state.
// We'll update it later.
setState(0, "NULL", 0, 0);
initial = expression();
setState(0, initial, initial);
} catch (StringIndexOutOfBoundsException e) {
error();
}
if (index != exp.length())
error();
// Point back to the start of the FSM
setState(state, "NULL", 0, 0);
}
private int expression() throws IllegalArgumentException {
int r, e1, s2;
r = disjunction();
e1 = state-1;
// Test if we've reached the end of the pattern
if (index == exp.length())
return r;
// Recursive if a valid character follows
if (isVocab(exp.charAt(index)) ||
exp.charAt(index) == '\\' ||
exp.charAt(index) == '(' ||
exp.charAt(index) == '.' ||
exp.charAt(index) == '[') {
s2 = expression();
setState(e1, s2, s2);
}
return r;
}
private int disjunction() throws IllegalArgumentException {
int r, s1, e1, s2, e2;
// Get start and end states for the term
s1 = term();
e1 = state-1;
r = s1;
// Test if we've reached the end of the pattern
if (index == exp.length())
return r;
// Recurse if this is a disjunction
if (exp.charAt(index) == '|') {
// Consume |
index++;
// Create a branching machine to point to the start of
// the term, and the start of the disjunction
setState(state, "NULL", s1, state+1);
r = state;
state++;
// Get the start and end states for the disjunction
s2 = disjunction();
e2 = state-1;
// Update the branching machine to point to the start of
// the term and the new disjunction
setState(r, s1, s2);
// Create an end state for this machine
setState(state, "NULL", state+1, state+1);
state++;
// Point both terms to the end state
setState(e1, state-1, state-1);
setState(e2, state-1, state-1);
}
// Return the start state of this machine
return r;
}
private int term() throws IllegalArgumentException {
int r, start;
// Save the state pointing to this one
start = state-1;
r = factor();
// Test if we've reached the end of the pattern, and if we
// have return the start state for the factor machine
if (index == exp.length())
return r;
// Zero or more
if (exp.charAt(index) == '*') {
// Create a branching state that points to the factor
// and the state after the factor.
setState(state, "NULL", r, state+1);
r = state;
state++;
// Consume *
index++;
}
// One or more
else if (exp.charAt(index) == '+') {
// Create a branching state that points to the factor,
// and the state after the factor
setState(state, "NULL", r, state+1);
state++;
// Consume +
index++;
}
// Zero or one
else if (exp.charAt(index) == '?') {
// Create a branching state that points to the factor,
// and the state after the factor
setState(state, "NULL", r, state+1);
r = state;
state++;
// Create an end state for this machine
setState(state, "NULL", state+1, state+1);
state++;
// Point the factor to the end state
setState(r, state-1, state-1);
r = state-1;
index++;
}
// Return the start state of this machine
return r;
}
private int factor() throws IllegalArgumentException {
int r, l;
r = state;
// Escaped characters
if (exp.charAt(index) == '\\') {
// Consume \
index++;
// Update FSM to contain character
setState(state, String.valueOf(exp.charAt(index)),
state+1, state+1);
state++;
// Consume escaped literal
index++;
}
// Literals
else if (isVocab(exp.charAt(index))) {
// Update FSM to contain character
setState(state, String.valueOf(exp.charAt(index)),
state+1, state+1);
state++;
// Consume literal
index++;
}
// Any literal
else if (exp.charAt(index) == '.') {
// Update FSM to contain character
setState(state, "WILD", state+1, state+1);
state++;
// Consume wild card
index++;
}
// List of literals
else if (exp.charAt(index) == '[') {
// Consume [
index++;
// If the first character is ], then it should be in the
// literal list. The easiest way is to either have a null
// state pointing to the start of the list, or having the
// consuming ']' state pointing to the start of the list.
// Is is a waste of a state..!
// Int to track ']'
int bracket = -1;
if (exp.charAt(index) == ']') {
// Consume ]
index++;
bracket = state;
// Create a state to consume the ]
setState(state, "]", -1, -1);
state++;
}
// Figure out where the list starts (and also build the
// rest of the list FSM).
r = state;
list();
// Add the square bracket if nessicary
if (bracket >= 0){
// The branching state will include it's child and bracket
setState(state-2, state-1, bracket);
// The bracket
setState(bracket, state, state);
}
// Make a branching maching to catch all the loose ends
setState(state, "NULL", state+1, state+1);
state++;
// Make sure the list was valid
if (exp.charAt(index) == ']') {
// Consume ]
index++;
} else
error();
}
// Nested expression
else if (exp.charAt(index) == '(') {
index++;
r = expression();
if (exp.charAt(index) == ')') {
index++;
} else
error();
}
// This isn't a factor!
else
error();
// Return the start state of this machine
return r;
}
private void list() throws IllegalArgumentException {
if (exp.charAt(index) != ']') {
// Make sure there is a character available. It is an
// error for there not to be!
if (index == exp.length())
error();
// Decide if we are the last item in the list
boolean last = exp.charAt(index+1) == ']';
// Create a branching machine to point to the literal and if
// necessary to the next branching machine of the list
setState(state, "NULL", last ? state+1 : state+2, state+1);
state++;
// Create a state to consume the literal and remember it's
// position for later
int lit = state;
setState(state, String.valueOf(exp.charAt(index)), -1, -1);
state++;
index++;
// Recurse if nessicary
if (!last)
list();
// Set the literal states end to the end of the list
setState(lit, state, state);
}
}
private void setState(int state, String s, int n1, int n2) {
Node n = new Node(s, n1, n2);
// State will replace an existing state
if (state < fsm.size()) {
try {
fsm.remove(state);
fsm.add(state, n);
} catch (IndexOutOfBoundsException e) {
System.err.println("This shouldn't happen..!");
e.printStackTrace();
System.exit(-1);
}
}
// State is a new state
else if (state == fsm.size()) {
fsm.add(n);
}
// This is invalid
else {
System.err.println("State value is too large for list!");
System.err.printf("state: %d, fsm.size(): %d\n", state,
fsm.size());
System.exit(-1);
}
}
private void setState(int state, int n1, int n2) {
Node n = fsm.remove(state);
// If both new links point to the same place, we handle it
// a bit differently.
if (n1 == n2) {
// If the links are the same, then set both
if (n.getNext1() == n.getNext2()) {
n.setNext1(n1);
n.setNext2(n1);
}
// Otherwise, just set the second link
else
n.setNext2(n1);
} else {
n.setNext1(n1);
n.setNext2(n2);
}
fsm.add(state, n);
}
private boolean isVocab(char c) {
for (char s : SPECIAL_CHARS) {
if (s == c)
return false;
}
return true;
}
private void error() throws IllegalArgumentException {
String message = String.format("%s is an invalid regex!",
exp);
throw new IllegalArgumentException(message);
}
public ArrayList<Node> getFSM() {
return this.fsm;
}
public static void main(String args[]) {
ArrayList<Node> fsm;
if (args.length < 1) {
System.err.println("This program requires a regex as an" +
"argument!");
return;
}
Compiler c = new Compiler();
c.setExpression(args[0]);
try {
c.compile();
} catch (IllegalArgumentException e) {
System.err.println(e.getMessage());
System.exit(-1);
}
fsm = c.getFSM();
for (int i = 0; i < fsm.size(); i++) {
//System.out.printf("%d: %s\n", i, fsm.get(i));
System.out.printf("%s\n", fsm.get(i));
}
}
}