Skip to content

Commit

Permalink
[GR-60108] TRegex: NFA generator improvements.
Browse files Browse the repository at this point in the history
PullRequest: graal/19434
  • Loading branch information
djoooooe committed Dec 20, 2024
2 parents e89022b + 1ee4c15 commit 7456cd5
Show file tree
Hide file tree
Showing 79 changed files with 4,119 additions and 1,572 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
Expand Down Expand Up @@ -59,6 +59,7 @@
import com.oracle.truffle.api.strings.TruffleString;
import com.oracle.truffle.regex.RegexLanguage;
import com.oracle.truffle.regex.RegexObject;
import com.oracle.truffle.regex.RegexSyntaxException;

@TruffleLanguage.Registration(name = TRegexTestDummyLanguage.NAME, id = TRegexTestDummyLanguage.ID, characterMimeTypes = TRegexTestDummyLanguage.MIME_TYPE, version = "0.1", dependentLanguages = RegexLanguage.ID)
public class TRegexTestDummyLanguage extends TruffleLanguage<TRegexTestDummyLanguage.DummyLanguageContext> {
Expand Down Expand Up @@ -111,8 +112,12 @@ public Object execute(VirtualFrame frame) {
}
}.getCallTarget();
}
return DummyLanguageContext.get(null).getEnv().parseInternal(
Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build());
try {
return DummyLanguageContext.get(null).getEnv().parseInternal(
Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build());
} catch (RegexSyntaxException e) {
throw e.withErrorCodeInMessage();
}
}

@GenerateInline
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ public void testParseFlags() {
assertTrue(parse("i").isIgnoreCase());
assertTrue(parse("m").isMultiLine());
assertTrue(parse("s").isDotAll());
assertTrue(parse("t").isTemplate());
assertTrue(parse("u").isUnicodeExplicitlySet());
assertTrue(parse("x").isVerbose());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
Expand Down Expand Up @@ -83,28 +83,29 @@ public void testBenchmarkRegexes() {
testInputStringGenerator(
"([-!#-''*+/-9=?A-Z^-~]+(\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"([ ]!#-[^-~ ]|(\\\\[-~ ]))+\")@[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?(\\.[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?)+");
testInputStringGenerator("(\\S+) (\\S+) (\\S+) \\[([A-Za-z0-9_:/]+\\s[-+]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?");
testInputStringGenerator("(?<=(a))\\1");
}

private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding) {
private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) {
String sourceString = createSourceString(pattern, flags, options, encoding);
Source source = Source.newBuilder("regex", sourceString, "regexSource").build();
RegexSource regexSource = RegexLanguage.createRegexSource(source);
RegexAST ast = regexSource.getOptions().getFlavor().createParser(language, regexSource, new CompilationBuffer(regexSource.getEncoding())).parse();
return InputStringGenerator.generate(ast, rng.nextLong());
return InputStringGenerator.generate(ast, rngSeed);
}

void testInputStringGenerator(String pattern) {
testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding());
testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding(), rng.nextLong());
}

void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding) {
void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) {
Value compiledRegex = compileRegex(pattern, flags);
testInputStringGenerator(pattern, flags, options, encoding, compiledRegex);
testInputStringGenerator(pattern, flags, options, encoding, rngSeed, compiledRegex);
}

private void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, Value compiledRegex) {
private void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed, Value compiledRegex) {
for (int i = 0; i < 20; i++) {
TruffleString input = generateInputString(pattern, flags, options, encoding);
TruffleString input = generateInputString(pattern, flags, options, encoding, rngSeed);
Assert.assertNotNull(input);
Value result = execRegex(compiledRegex, encoding, input, 0);
Assert.assertTrue(result.getMember("isMatch").asBoolean());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@
*/
package com.oracle.truffle.regex.tregex.test;

import com.oracle.truffle.regex.charset.Range;
import com.oracle.truffle.regex.tregex.parser.CaseFoldData;
import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags;
import com.oracle.truffle.regex.tregex.string.Encodings;
import com.oracle.truffle.regex.util.EmptyArrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Stream;

import org.graalvm.collections.Pair;
import org.graalvm.polyglot.Context;
import org.graalvm.polyglot.PolyglotException;
Expand All @@ -53,13 +56,12 @@
import org.junit.Ignore;
import org.junit.Test;

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Stream;
import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode;
import com.oracle.truffle.regex.charset.Range;
import com.oracle.truffle.regex.tregex.parser.CaseFoldData;
import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags;
import com.oracle.truffle.regex.tregex.string.Encodings;
import com.oracle.truffle.regex.util.EmptyArrays;

public class JavaUtilPatternTests extends RegexTestBase {

Expand Down Expand Up @@ -163,6 +165,8 @@ public void documentationSummary() {
// Boundary matchers
test("^", 0, "");
test("$", 0, "");
test("$", 0, "empty");
test("\\Z", 0, "\r\n");
test("\\b", 0, " a", 1);
// test("\\b{g}", 0, "");
test("\\B", 0, "b");
Expand Down Expand Up @@ -1263,6 +1267,112 @@ public void caseFolding() {
});
}

/**
 * Runs the regression cases listed between the GENERATED CODE markers below.
 *
 * The markers state that they must be kept for automatic updates, so the region between them
 * should not be edited by hand (presumably a generator rewrites it - confirm). The header
 * comment inside the region records that the cases were generated using Java version 24.
 *
 * Every {@code test(...)} call passes a pattern, a flags string, an input string, an integer
 * and a boolean, followed in some cases by further integers. NOTE(review): these look like the
 * search start index, the expected match outcome and capture-group bounds - confirm against
 * the helper's declaration, which is not visible here. The single {@code expectSyntaxError}
 * call names {@code ErrorCode.InvalidNamedGroup} as the expected error code.
 */
@Test
public void generatedTests() {
/* GENERATED CODE BEGIN - KEEP THIS MARKER FOR AUTOMATIC UPDATES */

// Generated using Java version 24
test("((A|){7,10}?){10,17}", "", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 0, true, 0, 86, 86, 86, 86, 86);
test("(a{1,30}){1,4}", "", "a", 0, true, 0, 1, 0, 1);
test("((a|){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7);
test("((a?){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7);
test("((|a){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0);
test("((a??){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0);
test("((a?){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6);
test("(a|^){100}", "", "a", 0, true, 0, 0, 0, 0);
test("(a|^){100}", "", "aa", 0, true, 0, 0, 0, 0);
test("(a|^){100}", "", "aa", 1, false);
test("(a|^){100}", "", "ab", 1, false);
test("(.)\\1{2,}", "", "billiam", 0, false);
test("(^_(a{1,2}[:])*a{1,2}[:]a{1,2}([.]a{1,4})?_)+", "", "_a:a:a.aaa_", 0, true, 0, 11, 0, 11, 1, 3, 6, 10);
test("(a{2}|())+$", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4);
test("^a(b*)\\1{4,6}?", "", "abbbb", 0, true, 0, 1, 1, 1);
test("^a(b*)\\1{4,6}?", "", "abbbbb", 0, true, 0, 6, 1, 2);
test("(?<=|$)", "", "a", 0, true, 0, 0);
test("(?=ab)a", "", "ab", 0, true, 0, 1);
test("(?=()|^)|x", "", "empty", 0, true, 0, 0, 0, 0);
test("a(?<=ba)", "", "ba", 0, true, 1, 2);
test("(?<=(?=|()))", "", "aa", 0, true, 0, 0, -1, -1);
test("\\d\\W", "iv", "4\u017f", 0, true, 0, 2);
test("[\u08bc-\ucf3a]", "iv", "\u03b0", 0, false);
test("a(?:|()\\1){1,2}", "", "a", 0, true, 0, 1, -1, -1);
expectSyntaxError("|(?<\\d\\1)\ub7e4", "", "", getTRegexEncoding(), "error", 0, ErrorCode.InvalidNamedGroup);
test("[a-z][a-z\u2028\u2029].|ab(?<=[a-z]w.)", "", "aac", 0, true, 0, 3);
test("(animation|animation-name)", "", "animation", 0, true, 0, 9, 0, 9);
test("(a|){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(a|){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(|a){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(a||b){7,7}c", "", "aaabc", 0, true, 0, 5, 4, 4);
test("(a||b){7,7}c", "", "aaac", 0, true, 0, 4, 3, 3);
test("(a||b){7,7}c", "", "aaabac", 0, true, 0, 6, 5, 5);
test("($|a){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
test("($|a){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3);
test("(a|$){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
test("(a|$){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3);
test("(a|$|b){7,7}", "", "aaab", 0, true, 0, 4, 4, 4);
test("(a|$|b){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
test("(a|$|b){7,7}", "", "aaaba", 0, true, 0, 5, 5, 5);
test("((?=a)|a){7,7}b", "", "aaa", 0, false);
test("((?=[ab])|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("a((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(a|){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(a|){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3);
test("(|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(|a){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3);
test("(a||b){0,7}c", "", "aaabc", 0, true, 0, 5, 4, 4);
test("(a||b){0,7}c", "", "aaac", 0, true, 0, 4, 3, 3);
test("(a||b){0,7}c", "", "aaabac", 0, true, 0, 6, 5, 5);
test("((?=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 2, 3);
test("((?=[ab])|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("a((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
test("(a*?){11,11}?b", "", "aaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 26, 10, 25);
test("(?:a(b{0,19})c)", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 9, 1, 8);
test("(?:a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 11, 1, 8);
test("(?<=a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 9, 11, 1, 8);
test("[\ud0d9](?<=\\S)", "", "\ud0d9", 0, true, 0, 1);
test("[\ud0d9](?<=\\W)", "", "\ud0d9", 0, true, 0, 1);
test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1);
test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1);
test("[\u8053](?<=\\S)", "", "\u8053", 0, true, 0, 1);
test("[\u8053](?<=\\W)", "", "\u8053", 0, true, 0, 1);
test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1);
test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1);
test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 4);
test("a|[bc]+(?<=[abc][abcd][abc])", "", "bbbb", 0, true, 0, 4);
test("a(b*)*c\\1d", "", "abbbbcbbd", 0, true, 0, 9, 3, 5);
test("(|a)||b(?<=cde)|", "", "a", 0, true, 0, 0, 0, 0);
test("^(\\1)?\\D*", "s", "empty", 0, true, 0, 5, -1, -1);
test("abcd(?<=d|c()d)", "", "_abcd", 0, true, 1, 5, -1, -1);
test("\\Dw\u3aa7\\A\\S(?<=\ue3b3|\\A()\\S)", "", "\udad1\udcfaw\u3aa7A\ue3b3", 0, false);
test("a(?:c|b(?=()))*", "", "abc", 0, true, 0, 3, 2, 2);
test("a(?:c|b(?=(c)))*", "", "abc", 0, true, 0, 3, 2, 3);
test("a(?:c|(?<=(a))b)*", "", "abc", 0, true, 0, 3, 0, 1);
test("(a||b){15,18}c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11);
test("(a||b){15,18}?c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11);
test("(?:ab|c|^){103,104}", "", "abcababccabccabababccabcababcccccabcababababccccabcabcabccabcabcccabababccabababcababababccababccabcababcabcabccabababccccabcab", 0, true, 0, 0);
test("((?<=a)bec)*d", "", "abecd", 0, true, 1, 5, 1, 4);
test("(|(^|\\z){2,77}?)?", "", "empty", 0, true, 0, 0, 0, 0, -1, -1);
test("a(|a{15,36}){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1);
test("a(|a{15,36}?){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1);
test("a(|a{15,36}){10,11}$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 66, 66, 66);
test("a(|a{15,36}?){10,11}b$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 67, 66, 66);
test("(?:a()|b??){22,26}c", "", "aabbbaabaaaaaabaaaac", 0, true, 0, 20, 19, 19);
test("b()(a\\1|){4,4}\\2c", "", "baaaac", 0, false);
test("a((?=b()|)[a-d])+", "", "abbbcbd", 0, true, 0, 7, 6, 7, 6, 6);
test("a(?=b(?<=ab)()|)", "", "ab", 0, true, 0, 1, 2, 2);
test("[ab]*?$(?<=[^b][ab][^b])", "", "aaaaaa", 0, true, 0, 6);
test("([ab]+){0,5}", "", "bbbba", 0, true, 0, 5, 0, 5);
test("[--a]", "v", "empty", 0, false);
test("(?:^\\1|$){10,11}bc", "", "aaaaaabc", 0, false);
test("a(?:|[0-9]+?a|[0-9a]){11,13}?[ab]", "", "a372a466a109585878b", 0, true, 0, 19);
test("\\Z", "", "\r\n", 0, true, 0, 0);

/* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
}

/**
 * Convenience overload: forwards {@code pattern}, {@code flags} and {@code input} unchanged to
 * the four-argument {@code test} overload, passing {@code 0} as the last argument.
 *
 * NOTE(review): the last argument is presumably the index at which matching starts - confirm
 * against the four-argument overload, which is not visible here.
 */
void test(String pattern, int flags, String input) {
test(pattern, flags, input, 0);
}
Expand Down
Loading

0 comments on commit 7456cd5

Please sign in to comment.