diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/MatcherBuilderTest.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/MatcherBuilderTest.java index d1c55cf0dcd1..780c445bc62a 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/MatcherBuilderTest.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/MatcherBuilderTest.java @@ -49,6 +49,7 @@ import com.oracle.truffle.regex.charset.CodePointSetAccumulator; import com.oracle.truffle.regex.charset.Range; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; +import com.oracle.truffle.regex.tregex.string.Encodings; public class MatcherBuilderTest { @@ -110,25 +111,26 @@ private static void checkContains(CodePointSet a, CodePointSet b, boolean expect } private static void checkInverse(CodePointSet a, int... values) { - checkMatch("inverse(" + a + ")", a.createInverse(), values); + checkMatch("inverse(" + a + ")", a.createInverse(Encodings.UTF_16), values); } private static void checkIntersection(CodePointSet a, CodePointSet b, int... 
values) { - CodePointSet intersection = a.createIntersection(b, new CompilationBuffer()); + CompilationBuffer compilationBuffer = new CompilationBuffer(Encodings.UTF_16); + CodePointSet intersection = a.createIntersection(b, compilationBuffer); checkMatch("intersection(" + a + "," + b + ")", intersection, values); assertTrue("intersection(" + a + "," + b + ")", a.intersects(b) == intersection.matchesSomething()); - CodePointSet.IntersectAndSubtractResult result = a.intersectAndSubtract(b, new CompilationBuffer()); - checkMatch("intersectAndSubtract(" + a + "," + b + ")[0]", result.subtractedA, a.subtract(intersection, new CompilationBuffer())); - checkMatch("intersectAndSubtract(" + a + "," + b + ")[1]", result.subtractedB, b.subtract(intersection, new CompilationBuffer())); + CodePointSet.IntersectAndSubtractResult result = a.intersectAndSubtract(b, compilationBuffer); + checkMatch("intersectAndSubtract(" + a + "," + b + ")[0]", result.subtractedA, a.subtract(intersection, compilationBuffer)); + checkMatch("intersectAndSubtract(" + a + "," + b + ")[1]", result.subtractedB, b.subtract(intersection, compilationBuffer)); checkMatch("intersectAndSubtract(" + a + "," + b + ")[2]", result.intersection, intersection); } private static void checkSubtraction(CodePointSet a, CodePointSet b, int... values) { - checkMatch("subtraction(" + a + "," + b + ")", a.subtract(b, new CompilationBuffer()), values); + checkMatch("subtraction(" + a + "," + b + ")", a.subtract(b, new CompilationBuffer(Encodings.UTF_16)), values); } private static void checkUnion(CodePointSet a, CodePointSet b, int... 
values) { - checkMatch("union(" + a + "," + b + ")", a.union(b, new CompilationBuffer()), values); + checkMatch("union(" + a + "," + b + ")", a.union(b, new CompilationBuffer(Encodings.UTF_16)), values); } @Test diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/RangesBufferTest.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/RangesBufferTest.java index 614d8de27c4a..25b387605bfc 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/RangesBufferTest.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/matchers/RangesBufferTest.java @@ -44,7 +44,6 @@ import org.junit.Test; import com.oracle.truffle.regex.charset.RangesBuffer; -import com.oracle.truffle.regex.tregex.buffer.CharRangesBuffer; import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; public class RangesBufferTest { @@ -55,12 +54,6 @@ private static void appendAll(RangesBuffer buf, int[] content) { } } - private static CharRangesBuffer createCharRangesBuffer(int[] content) { - CharRangesBuffer buf = new CharRangesBuffer(); - appendAll(buf, content); - return buf; - } - private static IntRangesBuffer createIntRangesBuffer(int[] content) { IntRangesBuffer buf = new IntRangesBuffer(); appendAll(buf, content); @@ -68,10 +61,7 @@ private static IntRangesBuffer createIntRangesBuffer(int[] content) { } private static boolean equals(RangesBuffer buf, int[] content) { - if (buf instanceof IntRangesBuffer) { - return equals((IntRangesBuffer) buf, content); - } - return equals((CharRangesBuffer) buf, content); + return equals((IntRangesBuffer) buf, content); } private static boolean equals(IntRangesBuffer buf, int[] content) { @@ -86,18 +76,6 @@ private static boolean equals(IntRangesBuffer buf, int[] content) { return true; } - private static boolean equals(CharRangesBuffer buf, int[] content) { - if (buf.size() != content.length / 2) { - 
return false; - } - for (int i = 0; i < content.length / 2; i++) { - if (buf.getLo(i) != (char) content[i * 2] || buf.getHi(i) != (char) content[i * 2 + 1]) { - return false; - } - } - return true; - } - private static String matchError(String errorMsg, RangesBuffer m, int[] expected) { StringBuilder sb = new StringBuilder(errorMsg).append(": got ").append(m.toString()).append(", expected [ "); for (int i = 0; i < expected.length; i += 2) { @@ -115,34 +93,23 @@ private static void checkAddRange(RangesBuffer buf, int lo, int hi, int[] expect private static void checkAddRange(int[] buf, int lo, int hi, int[] expected) { checkAddRange(createIntRangesBuffer(buf), lo, hi, expected); - checkAddRange(createCharRangesBuffer(buf), lo, hi, expected); - } - - private static void checkAddRange(int[] buf, int lo, int hi, int[] expected, boolean intBuffer) { - if (intBuffer) { - checkAddRange(createIntRangesBuffer(buf), lo, hi, expected); - } else { - checkAddRange(createCharRangesBuffer(buf), lo, hi, expected); - } } @Test public void testAddRange() { + int max = Character.MAX_VALUE; checkAddRange(new int[]{}, 0, 1, new int[]{0, 1}); - for (boolean intBuffer : new boolean[]{true, false}) { - int max = intBuffer ? 
Character.MAX_CODE_POINT : Character.MAX_VALUE; - checkAddRange(new int[]{}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{0, 0}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{1, 1}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{0, 2}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{max - 1, max}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{max - 1, max - 1}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{max, max}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}, intBuffer); - checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, max, max, new int[]{2, 2, 4, 4, 6, 7, max, max}, intBuffer); - checkAddRange(new int[]{2, 2, 4, 4, 6, 7, max, max}, max - 1, max - 1, new int[]{2, 2, 4, 4, 6, 7, max - 1, max}, intBuffer); - } + checkAddRange(new int[]{}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{0, 0}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{1, 1}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{0, 2}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{max - 1, max}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{max - 1, max - 1}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{max, max}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}); + checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, max, max, new int[]{2, 2, 4, 4, 6, 7, max, max}); + checkAddRange(new int[]{2, 2, 4, 4, 6, 7, max, max}, max - 1, max - 1, new int[]{2, 2, 4, 4, 6, 7, max - 1, max}); checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 2, 2, new int[]{2, 2, 4, 4, 6, 7}); checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 9, 9, new int[]{2, 2, 4, 4, 6, 7, 9, 9}); checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 8, 8, new 
int[]{2, 2, 4, 4, 6, 8}); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexCompilerWithFallback.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexCompilerWithFallback.java index ad5e46efc81c..f496b83674e3 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexCompilerWithFallback.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexCompilerWithFallback.java @@ -40,15 +40,12 @@ */ package com.oracle.truffle.regex; +import java.util.logging.Level; + import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.regex.tregex.util.DebugUtil; - -import java.util.logging.Level; - -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_BAILOUT_MESSAGES; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_COMPILER_FALLBACK; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_TOTAL_COMPILATION_TIME; +import com.oracle.truffle.regex.tregex.util.Loggers; public class RegexCompilerWithFallback implements RegexCompiler { @@ -79,9 +76,9 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu if (shouldLog) { elapsedTimeMain = timer.getElapsed(); } - LOG_COMPILER_FALLBACK.finer(() -> "Primary compiler used: " + regexSource); + Loggers.LOG_COMPILER_FALLBACK.finer(() -> "Primary compiler used: " + regexSource); } catch (UnsupportedRegexException mainBailout) { - LOG_BAILOUT_MESSAGES.fine(() -> mainBailout.getReason() + ": " + regexSource); + Loggers.LOG_BAILOUT_MESSAGES.fine(() -> mainBailout.getReason() + ": " + regexSource); try { if (shouldLog) { timer.start(); @@ -90,9 +87,9 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu if (shouldLog) { elapsedTimeFallback = timer.getElapsed(); } - LOG_COMPILER_FALLBACK.fine(() -> String.format("Secondary compiler used (primary bailout due to '%s'): %s", 
mainBailout.getReason(), regexSource)); + Loggers.LOG_COMPILER_FALLBACK.fine(() -> String.format("Secondary compiler used (primary bailout due to '%s'): %s", mainBailout.getReason(), regexSource)); } catch (UnsupportedRegexException fallbackBailout) { - LOG_COMPILER_FALLBACK.fine(() -> String.format("No compiler handled following regex (primary bailout: '%s'; secondary bailout: '%s'): %s", mainBailout.getReason(), + Loggers.LOG_COMPILER_FALLBACK.fine(() -> String.format("No compiler handled following regex (primary bailout: '%s'; secondary bailout: '%s'): %s", mainBailout.getReason(), fallbackBailout.getReason(), regexSource)); String bailoutReasons = String.format("%s; %s", mainBailout.getReason(), fallbackBailout.getReason()); throw new UnsupportedRegexException(bailoutReasons, regexSource); @@ -105,11 +102,11 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu } private static boolean shouldLogCompilationTime() { - return LOG_TOTAL_COMPILATION_TIME.isLoggable(Level.FINE); + return Loggers.LOG_TOTAL_COMPILATION_TIME.isLoggable(Level.FINE); } private static void logCompilationTime(RegexSource regexSource, long elapsedTimeMain, long elapsedTimeFallback) { - LOG_TOTAL_COMPILATION_TIME.log(Level.FINE, "{0}, {1}, {2}, {3}", new Object[]{ + Loggers.LOG_TOTAL_COMPILATION_TIME.log(Level.FINE, "{0}, {1}, {2}, {3}", new Object[]{ DebugUtil.Timer.elapsedToString(elapsedTimeMain + elapsedTimeFallback), DebugUtil.Timer.elapsedToString(elapsedTimeMain), DebugUtil.Timer.elapsedToString(elapsedTimeFallback), diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/RegexUnifier.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/RegexUnifier.java index 2292ca31a487..ce48630425ca 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/RegexUnifier.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/RegexUnifier.java @@ -46,6 +46,7 
@@ import com.oracle.truffle.regex.RegexSyntaxException; import com.oracle.truffle.regex.tregex.parser.RegexLexer; import com.oracle.truffle.regex.tregex.parser.Token; +import com.oracle.truffle.regex.tregex.string.Encodings; /** * Generates a "unified" regular expression representation where all single characters are replaced @@ -62,7 +63,7 @@ public final class RegexUnifier { public RegexUnifier(RegexSource source) { this.source = source; - this.lexer = new RegexLexer(source, RegexFlags.parseFlags(source.getFlags()), RegexOptions.DEFAULT); + this.lexer = new RegexLexer(source, RegexFlags.parseFlags(source.getFlags()), Encodings.UTF_32, RegexOptions.DEFAULT); this.dump = new StringBuilder(source.getPattern().length()); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CharMatchers.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CharMatchers.java index 6381183aaf30..0c8c7c8a7e16 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CharMatchers.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CharMatchers.java @@ -40,11 +40,11 @@ */ package com.oracle.truffle.regex.charset; +import static com.oracle.truffle.regex.util.BitSets.highByte; +import static com.oracle.truffle.regex.util.BitSets.lowByte; + import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; -import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; -import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer; import com.oracle.truffle.regex.tregex.matchers.AnyMatcher; import com.oracle.truffle.regex.tregex.matchers.BitSetMatcher; import com.oracle.truffle.regex.tregex.matchers.CharMatcher; @@ -52,14 +52,11 @@ import com.oracle.truffle.regex.tregex.matchers.HybridBitSetMatcher; import 
com.oracle.truffle.regex.tregex.matchers.InvertibleCharMatcher; import com.oracle.truffle.regex.tregex.matchers.MultiBitSetMatcher; -import com.oracle.truffle.regex.tregex.matchers.ProfilingCharMatcher; import com.oracle.truffle.regex.tregex.matchers.RangeListMatcher; import com.oracle.truffle.regex.tregex.matchers.RangeTreeMatcher; import com.oracle.truffle.regex.tregex.matchers.SingleCharMatcher; import com.oracle.truffle.regex.tregex.matchers.SingleRangeMatcher; import com.oracle.truffle.regex.tregex.matchers.TwoCharMatcher; -import com.oracle.truffle.regex.tregex.string.Encodings; -import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.util.CompilationFinalBitSet; /** @@ -67,47 +64,19 @@ */ public class CharMatchers { - /** - * Create a new {@link CharMatcher} from the given code point set, based on {@code encoding}: - * - */ - public static CharMatcher createMatcher(CodePointSet cps, Encoding encoding, CompilationBuffer compilationBuffer) { - if (encoding == Encodings.UTF_16) { - return createMatcherIntl(cps, compilationBuffer); - } else if (encoding == Encodings.UTF_16_RAW) { - return createMatcherIntl(CodePointSetBMPView.create(cps), compilationBuffer); - } else { - throw new UnsupportedOperationException(); - } - } - - private static CharMatcher createMatcherIntl(ImmutableSortedListOfIntRanges cps, CompilationBuffer compilationBuffer) { - if (cps.matchesMinAndMax() || cps.inverseIsSameHighByte()) { + public static CharMatcher createMatcher(CodePointSet cps, CompilationBuffer compilationBuffer) { + if (cps.matchesMinAndMax(compilationBuffer.getEncoding()) || cps.inverseIsSameHighByte(compilationBuffer.getEncoding())) { // the inverse of the given set is easier to match, generate inverted matcher - return createMatcher(cps.createInverse(), compilationBuffer, true); + return createMatcher(cps.createInverse(compilationBuffer.getEncoding()), compilationBuffer, true); } return createMatcher(cps, compilationBuffer, 
false); } - public static int highByte(int c) { - return c >> Byte.SIZE; - } - - public static int lowByte(int c) { - return c & 0xff; - } - - private static CharMatcher createMatcher(ImmutableSortedListOfIntRanges cps, CompilationBuffer compilationBuffer, boolean inverse) { + private static CharMatcher createMatcher(CodePointSet cps, CompilationBuffer compilationBuffer, boolean inverse) { if (cps.isEmpty()) { return EmptyMatcher.create(inverse); } - if (cps.matchesEverything()) { + if (cps.matchesEverything(compilationBuffer.getEncoding())) { return AnyMatcher.create(inverse); } if (cps.matchesSingleChar()) { @@ -123,145 +92,37 @@ private static CharMatcher createMatcher(ImmutableSortedListOfIntRanges cps, Com if (preferRangeListMatcherOverBitSetMatcher(cps, size)) { return RangeListMatcher.create(inverse, cps.toArray()); } - InvertibleCharMatcher bitSetMatcher = convertToBitSetMatcher(cps, compilationBuffer, inverse); - if (bitSetMatcher != null) { - return bitSetMatcher; + if (highByte(cps.getMin()) == highByte(cps.getMax())) { + return convertToBitSetMatcher(cps, compilationBuffer, inverse); } - CharMatcher charMatcher; if (size > 100 && cps.getMax() <= 0xffff) { - charMatcher = MultiBitSetMatcher.fromRanges(inverse, cps); + return MultiBitSetMatcher.fromRanges(inverse, cps); } else { - charMatcher = createHybridMatcher(cps, compilationBuffer, inverse); + CompressedCodePointSet ccps = CompressedCodePointSet.create(cps, compilationBuffer); + if (ccps.hasBitSets()) { + return HybridBitSetMatcher.create(inverse, ccps); + } else if (ccps.size() <= 10) { + return RangeListMatcher.create(inverse, ccps.getRanges()); + } else { + return RangeTreeMatcher.fromRanges(inverse, ccps.getRanges()); + } } - ImmutableSortedListOfIntRanges byteRange = cps instanceof CodePointSet ? 
Constants.BYTE_RANGE : CodePointSetBMPView.create(Constants.BYTE_RANGE); - return ProfilingCharMatcher.create(createMatcher(cps.createIntersection(byteRange, compilationBuffer), compilationBuffer, inverse), charMatcher); } - private static boolean preferRangeListMatcherOverBitSetMatcher(ImmutableSortedListOfIntRanges cps, int size) { + private static boolean preferRangeListMatcherOverBitSetMatcher(CodePointSet cps, int size) { // for up to two ranges, RangeListMatcher is faster than any BitSet matcher // also, up to four single character checks are still faster than a bit set return size <= 2 || cps.valueCountMax(4); } - private static InvertibleCharMatcher convertToBitSetMatcher(ImmutableSortedListOfIntRanges cps, CompilationBuffer compilationBuffer, boolean inverse) { + private static InvertibleCharMatcher convertToBitSetMatcher(CodePointSet cps, CompilationBuffer compilationBuffer, boolean inverse) { int highByte = highByte(cps.getMin()); - if (highByte(cps.getMax()) != highByte) { - return null; - } CompilationFinalBitSet bs = compilationBuffer.getByteSizeBitSet(); for (int i = 0; i < cps.size(); i++) { assert highByte(cps.getLo(i)) == highByte && highByte(cps.getHi(i)) == highByte; bs.setRange(lowByte(cps.getLo(i)), lowByte(cps.getHi(i))); } - return BitSetMatcher.create(inverse, highByte, bs.copy()); - } - - private static CharMatcher createHybridMatcher(ImmutableSortedListOfIntRanges cps, CompilationBuffer compilationBuffer, boolean inverse) { - int size = cps.size(); - assert size >= 1; - IntRangesBuffer ranges = compilationBuffer.getIntRangesBuffer1(); - ObjectArrayBuffer bitSets = compilationBuffer.getObjectBuffer1(); - // index of lowest range on current plane - int lowestOCP = 0; - int curPlane = highByte(cps.getHi(0)); - for (int i = 0; i < size; i++) { - if (highByte(cps.getLo(i)) != curPlane) { - processRanges(cps, ranges, bitSets, lowestOCP, i, i, curPlane); - curPlane = highByte(cps.getLo(i)); - lowestOCP = i; - } - if (highByte(cps.getHi(i)) != 
curPlane) { - if (lowestOCP != i) { - processRanges(cps, ranges, bitSets, lowestOCP, i, i + 1, curPlane); - } - curPlane = highByte(cps.getHi(i)); - lowestOCP = i; - } - } - processRanges(cps, ranges, bitSets, lowestOCP, size, size, curPlane); - if (bitSets.isEmpty()) { - assert ranges.length() == size * 2; - if (size <= 10) { - return RangeListMatcher.create(inverse, cps.toArray()); - } else { - return RangeTreeMatcher.fromRanges(inverse, cps.toArray()); - } - } - assert ranges.rangesAreSortedAndDisjoint(); - return HybridBitSetMatcher.create(inverse, ranges.toArray(), bitSets.toArray(new CompilationFinalBitSet[bitSets.length()])); - } - - private static void processRanges(ImmutableSortedListOfIntRanges cps, IntRangesBuffer ranges, ObjectArrayBuffer bitSets, int iMin, int iMax, int iMaxBS, int curPlane) { - if (isOverBitSetConversionThreshold(iMaxBS - iMin)) { - addBitSet(cps, bitSets, ranges, iMin, iMaxBS, curPlane); - } else { - addRanges(cps, bitSets, ranges, iMin, iMax); - } - } - - private static boolean isOverBitSetConversionThreshold(int nRanges) { - return nRanges >= TRegexOptions.TRegexRangeToBitSetConversionThreshold; - } - - private static void addRanges(ImmutableSortedListOfIntRanges cps, ObjectArrayBuffer bitSets, IntRangesBuffer ranges, int iMinArg, int iMax) { - if (iMinArg == iMax) { - return; - } - int iMin = iMinArg; - if (!bitSets.isEmpty() && bitSets.peek() != null && highByte(cps.getLo(iMin)) == highByte(ranges.getMax())) { - ranges.appendRangeAllowAdjacent(ranges.getMax() + 1, cps.getHi(iMin)); - iMin++; - } - cps.appendRangesTo(ranges, iMin, iMax); - for (int i = 0; i < iMax - iMinArg; i++) { - bitSets.add(null); - } - } - - private static void addBitSet(ImmutableSortedListOfIntRanges cps, ObjectArrayBuffer bitSets, IntRangesBuffer ranges, int iMin, int iMax, int curPlane) { - assert iMax - iMin > 1; - int curPlaneLo = curPlane << Byte.SIZE; - if (rangeCrossesPlanes(cps, iMin)) { - if (bitSets.isEmpty() || bitSets.peek() == null || 
highByte(cps.getLo(iMin)) != highByte(ranges.getMax())) { - ranges.appendRangeAllowAdjacent(cps.getLo(iMin), curPlaneLo - 1); - bitSets.add(null); - } else if (highByte(cps.getHi(iMin)) - highByte(cps.getLo(iMin)) > 1) { - ranges.appendRangeAllowAdjacent(ranges.getMax() + 1, curPlaneLo - 1); - bitSets.add(null); - } - } - CompilationFinalBitSet bs; - int bsRangeLo; - int bsRangeHi; - int iMinBS = iMin; - int iMaxBS = iMax; - if (rangeCrossesPlanes(cps, iMax - 1)) { - bs = new CompilationFinalBitSet(0xff); - iMaxBS--; - bs.setRange(lowByte(cps.getLo(iMax - 1)), 0xff); - bsRangeHi = curPlaneLo | 0xff; - } else { - bs = new CompilationFinalBitSet(lowByte(cps.getHi(iMax - 1))); - bsRangeHi = cps.getHi(iMax - 1); - } - if (rangeCrossesPlanes(cps, iMin)) { - assert highByte(cps.getHi(iMin)) == curPlane; - iMinBS++; - bs.setRange(0, lowByte(cps.getHi(iMin))); - bsRangeLo = curPlaneLo; - } else { - bsRangeLo = cps.getLo(iMin); - } - for (int i = iMinBS; i < iMaxBS; i++) { - assert highByte(cps.getLo(i)) == curPlane && highByte(cps.getHi(i)) == curPlane; - bs.setRange(lowByte(cps.getLo(i)), lowByte(cps.getHi(i))); - } - bitSets.add(bs); - ranges.appendRangeAllowAdjacent(bsRangeLo, bsRangeHi); - } - - private static boolean rangeCrossesPlanes(ImmutableSortedListOfIntRanges ranges, int i) { - return highByte(ranges.getLo(i)) != highByte(ranges.getHi(i)); + return BitSetMatcher.create(inverse, highByte, bs.toLongArray()); } @TruffleBoundary diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSet.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSet.java index 80d88e2fdfcc..b36e3d2b2d40 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSet.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSet.java @@ -44,27 +44,22 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import 
com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; import com.oracle.truffle.regex.tregex.util.json.JsonValue; public final class CodePointSet extends ImmutableSortedListOfIntRanges implements Comparable, JsonConvertible { - public static final int MIN_VALUE = Character.MIN_CODE_POINT; - public static final int MAX_VALUE = Character.MAX_CODE_POINT; private static final CodePointSet CONSTANT_EMPTY = new CodePointSet(new int[0]); - private static final CodePointSet CONSTANT_FULL = new CodePointSet(new int[]{MIN_VALUE, MAX_VALUE}); private static final CodePointSet[] CONSTANT_ASCII = new CodePointSet[128]; - private static final CodePointSet[] CONSTANT_INVERSE_ASCII = new CodePointSet[128]; private static final CodePointSet[] CONSTANT_CASE_FOLD_ASCII = new CodePointSet[26]; static { CONSTANT_ASCII[0] = new CodePointSet(new int[]{0, 0}); - CONSTANT_INVERSE_ASCII[0] = new CodePointSet(new int[]{1, Character.MAX_CODE_POINT}); for (int i = 1; i < 128; i++) { CONSTANT_ASCII[i] = new CodePointSet(new int[]{i, i}); - CONSTANT_INVERSE_ASCII[i] = new CodePointSet(new int[]{0, i - 1, i + 1, Character.MAX_CODE_POINT}); } for (int i = 'A'; i <= 'Z'; i++) { CONSTANT_CASE_FOLD_ASCII[i - 'A'] = new CodePointSet(new int[]{i, i, Character.toLowerCase(i), Character.toLowerCase(i)}); @@ -73,7 +68,7 @@ public final class CodePointSet extends ImmutableSortedListOfIntRanges implement private CodePointSet(int[] ranges) { super(ranges); - assert ranges.length == 0 || ranges[0] >= MIN_VALUE && ranges[ranges.length - 1] <= MAX_VALUE; + assert ranges.length == 0 || ranges[0] >= 0 && ranges[ranges.length - 1] >= 0; } public int[] getRanges() { @@ -84,10 +79,6 @@ public static CodePointSet getEmpty() { return CONSTANT_EMPTY; } - public static CodePointSet getFull() { - return CONSTANT_FULL; - } - public 
static CodePointSet createNoDedup(int... ranges) { return new CodePointSet(ranges); } @@ -123,14 +114,8 @@ private static CodePointSet checkConstants(int[] ranges, int length) { if (ranges[0] == ranges[1] && ranges[0] < 128) { return CONSTANT_ASCII[ranges[0]]; } - if (ranges[0] == Character.MIN_CODE_POINT && ranges[1] == Character.MAX_CODE_POINT) { - return CONSTANT_FULL; - } } if (length == 4) { - if (ranges[0] == Character.MIN_CODE_POINT && ranges[3] == Character.MAX_CODE_POINT && ranges[2] <= 128 && ranges[1] + 2 == ranges[2]) { - return CONSTANT_INVERSE_ASCII[ranges[1] + 1]; - } if (ranges[0] == ranges[1] && ranges[0] >= 'A' && ranges[0] <= 'Z' && ranges[2] == ranges[3] && ranges[2] == (ranges[0] | 0x20)) { return CONSTANT_CASE_FOLD_ASCII[ranges[0] - 'A']; } @@ -149,12 +134,6 @@ public CodePointSet createEmpty() { return getEmpty(); } - @SuppressWarnings("unchecked") - @Override - public CodePointSet createFull() { - return getFull(); - } - @SuppressWarnings("unchecked") @Override public CodePointSet create(RangesBuffer buffer) { @@ -169,32 +148,41 @@ public boolean equalsBuffer(RangesBuffer buffer) { return ranges.length == buf.length() && rangesEqual(ranges, buf.getBuffer(), ranges.length); } + @SuppressWarnings("unchecked") @Override - public int getMinValue() { - return MIN_VALUE; + public CodePointSet createInverse(Encoding encoding) { + return createInverse(this, encoding); } - @Override - public int getMaxValue() { - return MAX_VALUE; + public static CodePointSet createInverse(SortedListOfRanges src, Encoding encoding) { + if (src.matchesNothing()) { + return encoding.getFullSet(); + } + return new CodePointSet(createInverseArray(src, encoding)); } @SuppressWarnings("unchecked") @Override - public CodePointSet createInverse() { - return createInverse(this); - } - - public static CodePointSet createInverse(SortedListOfRanges src) { - assert src.getMinValue() == Character.MIN_CODE_POINT; - assert src.getMaxValue() == Character.MAX_CODE_POINT; - if 
(src.matchesNothing()) { - return getFull(); + public T createIntersectionSingleRange(T o) { + assert size() == 1 && !o.isEmpty(); + if (getMin() <= o.getMin() && getMax() >= o.getMax()) { + return o; + } + int iLo = 0; + int iHi = o.size() - 1; + while (iLo < o.size() && o.getHi(iLo) < getMin()) { + iLo++; + } + while (iHi >= 0 && o.getLo(iHi) > getMax()) { + iHi--; } - if (src.matchesSingleAscii()) { - return CONSTANT_INVERSE_ASCII[src.getMin()]; + if (iHi < iLo) { + return (T) createEmpty(); } - return new CodePointSet(createInverseArray(src)); + int[] intersection = Arrays.copyOfRange(((CodePointSet) o).ranges, iLo * 2, (iHi + 1) * 2); + intersection[0] = Math.max(intersection[0], getMin()); + intersection[intersection.length - 1] = Math.min(intersection[intersection.length - 1], getMax()); + return (T) create(intersection); } @Override @@ -202,24 +190,6 @@ public int compareTo(CodePointSet o) { if (this == o) { return 0; } - if (matchesEverything()) { - if (o.matchesEverything()) { - return 0; - } - return 1; - } - if (matchesNothing()) { - if (o.matchesNothing()) { - return 0; - } - return -1; - } - if (o.matchesEverything()) { - return -1; - } - if (o.matchesNothing()) { - return 1; - } int cmp = size() - o.size(); if (cmp != 0) { return cmp; @@ -281,17 +251,37 @@ public int[] toArray() { return getRanges(); } - public char[] inverseToCharArray() { - char[] array = new char[inverseValueCount()]; + public byte[] inverseToByteArray(Encoding encoding) { + byte[] array = new byte[inverseValueCount(encoding)]; + int index = 0; + int lastHi = -1; + for (int i = 0; i < size(); i++) { + for (int j = lastHi + 1; j < getLo(i); j++) { + assert j <= 0xff; + array[index++] = (byte) j; + } + lastHi = getHi(i); + } + for (int j = lastHi + 1; j <= encoding.getMaxValue(); j++) { + assert j <= 0xff; + array[index++] = (byte) j; + } + return array; + } + + public char[] inverseToCharArray(Encoding encoding) { + char[] array = new char[inverseValueCount(encoding)]; int index = 
0; int lastHi = -1; for (int i = 0; i < size(); i++) { for (int j = lastHi + 1; j < getLo(i); j++) { + assert j <= Character.MAX_VALUE; array[index++] = (char) j; } lastHi = getHi(i); } - for (int j = lastHi + 1; j <= getMaxValue(); j++) { + for (int j = lastHi + 1; j <= encoding.getMaxValue(); j++) { + assert j <= Character.MAX_VALUE; array[index++] = (char) j; } return array; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetAccumulator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetAccumulator.java index 88af2d356892..b70ef8ff2cea 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetAccumulator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetAccumulator.java @@ -56,7 +56,7 @@ public IntRangesBuffer get() { return acc; } - private IntRangesBuffer getTmp() { + public IntRangesBuffer getTmp() { if (tmp == null) { tmp = new IntRangesBuffer(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetBMPView.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetBMPView.java deleted file mode 100644 index b124693b0c52..000000000000 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CodePointSetBMPView.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * The Universal Permissive License (UPL), Version 1.0 - * - * Subject to the condition set forth below, permission is hereby granted to any - * person obtaining a copy of this software, associated documentation and/or - * data (collectively the "Software"), free of charge and under any and all - * copyright rights in the Software, and any and all patent rights owned or - * freely licensable by each licensor hereunder covering either (i) the - * unmodified Software as contributed to or provided by such licensor, or (ii) - * the Larger Works (as defined below), to deal in both - * - * (a) the Software, and - * - * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if - * one is included with the Software each a "Larger Work" to which the Software - * is contributed by such licensors), - * - * without restriction, including without limitation the rights to copy, create - * derivative works of, display, perform, and distribute the Software and make, - * use, sell, offer for sale, import, export, have made, and have sold the - * Software and the Larger Work(s), and to sublicense the foregoing rights on - * either these or other terms. - * - * This license is subject to the following condition: - * - * The above copyright notice and either this complete permission notice or at a - * minimum a reference to the UPL must be included in all copies or substantial - * portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -package com.oracle.truffle.regex.charset; - -import java.util.Arrays; - -import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; - -public final class CodePointSetBMPView extends ImmutableSortedListOfIntRanges { - - public static final int MIN_VALUE = Character.MIN_VALUE; - public static final int MAX_VALUE = Character.MAX_VALUE; - - private static final CodePointSetBMPView EMPTY = new CodePointSetBMPView(new int[]{}); - private static final CodePointSetBMPView FULL = new CodePointSetBMPView(new int[]{MIN_VALUE, MAX_VALUE}); - - private final int size; - - private CodePointSetBMPView(int[] ranges) { - super(ranges); - this.size = viewSize(ranges); - } - - private static int viewSize(int[] ranges) { - if (ranges.length == 0) { - return 0; - } - if (ranges[ranges.length - 2] > MAX_VALUE) { - assert ranges[ranges.length - 2] == Character.MAX_VALUE + 1 && ranges[ranges.length - 1] == Character.MAX_CODE_POINT; - return (ranges.length / 2) - 1; - } else { - return ranges.length / 2; - } - } - - public static CodePointSetBMPView create(IntRangesBuffer buf) { - return new CodePointSetBMPView(buf.toArray()); - } - - public static CodePointSetBMPView create(CodePointSet cps) { - return new CodePointSetBMPView(cps.getRanges()); - } - - @Override - public int size() { - return size; - } - - @Override - public int getLo(int i) { - int lo = super.getLo(i); - assert lo <= MAX_VALUE; - return lo; - } - - @Override - public int getHi(int i) { - int hi = super.getHi(i); - assert hi <= MAX_VALUE || hi == Character.MAX_CODE_POINT; - return (char) hi; - } - - @Override - public int[] toArray() { - return size * 2 == ranges.length ? 
ranges : Arrays.copyOf(ranges, size * 2); - } - - @SuppressWarnings("unchecked") - @Override - public CodePointSetBMPView createEmpty() { - return EMPTY; - } - - @SuppressWarnings("unchecked") - @Override - public CodePointSetBMPView createFull() { - return FULL; - } - - @SuppressWarnings("unchecked") - @Override - public CodePointSetBMPView create(RangesBuffer buffer) { - assert buffer instanceof IntRangesBuffer; - return create((IntRangesBuffer) buffer); - } - - @Override - public int getMinValue() { - return MIN_VALUE; - } - - @Override - public int getMaxValue() { - return MAX_VALUE; - } - - @SuppressWarnings("unchecked") - @Override - public CodePointSetBMPView createInverse() { - if (isEmpty()) { - return createFull(); - } - return new CodePointSetBMPView(createInverseArray(this)); - } - - @Override - public boolean equalsBuffer(RangesBuffer buffer) { - assert buffer instanceof IntRangesBuffer; - IntRangesBuffer buf = (IntRangesBuffer) buffer; - if (isEmpty()) { - return buf.isEmpty(); - } - return size() == buf.length() * 2 && rangesEqual(ranges, buf.getBuffer(), (size() * 2) - 1) && getMax() == (char) buf.getMax(); - } -} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CompressedCodePointSet.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CompressedCodePointSet.java new file mode 100644 index 000000000000..6ab49e0d8233 --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/CompressedCodePointSet.java @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.truffle.regex.charset; + +import static com.oracle.truffle.regex.util.BitSets.highByte; +import static com.oracle.truffle.regex.util.BitSets.lowByte; + +import java.util.Arrays; + +import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; +import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.regex.tregex.TRegexOptions; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; +import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; +import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer; +import com.oracle.truffle.regex.tregex.matchers.HybridBitSetMatcher; +import com.oracle.truffle.regex.tregex.nodes.dfa.AllTransitionsInOneTreeMatcher; +import com.oracle.truffle.regex.util.BitSets; + +/** + * Compressed variant of {@link CodePointSet}. Stores clusters of small ranges as bit sets, if + * possible. Every range in this set's sorted list of ranges may be associated to a bit set, which + * stores the more "fine-grained" ranges inside the larger range. + * + * Example: + * + *
+ * <pre>
+ * Character set:
+ * [0x02, 0x04, 0x06, 0x1000, 0x1020-0x1030]
+ *
+ * Resulting compressed code point set:
+ * - ranges:   [0x02-0x06,          0x1000, 0x1020-0x1030]
+ * - bit-sets: [[0x02, 0x04, 0x06], null,   null         ]
+ * </pre>
+ * + * @see HybridBitSetMatcher + * @see AllTransitionsInOneTreeMatcher + */ +public final class CompressedCodePointSet { + + private static final long[][] NO_BITSETS = {}; + + @CompilationFinal(dimensions = 1) private final int[] ranges; + @CompilationFinal(dimensions = 2) private final long[][] bitSets; + + private CompressedCodePointSet(int[] ranges, long[][] bitSets) { + this.ranges = ranges; + this.bitSets = bitSets; + assert this.bitSets == null || this.bitSets.length == this.ranges.length / 2; + } + + public int[] getRanges() { + return ranges; + } + + public boolean hasBitSets() { + return bitSets != null; + } + + public long[][] getBitSets() { + return bitSets; + } + + public int getLo(int i) { + return ranges[i * 2]; + } + + public int getHi(int i) { + return ranges[(i * 2) + 1]; + } + + public int size() { + return ranges.length / 2; + } + + public boolean hasBitSet(int i) { + return hasBitSets() && bitSets[i] != null; + } + + public long[] getBitSet(int i) { + return bitSets[i]; + } + + public static CompressedCodePointSet create(ImmutableSortedListOfIntRanges cps, CompilationBuffer compilationBuffer) { + int size = cps.size(); + if (size < TRegexOptions.TRegexRangeToBitSetConversionThreshold) { + return new CompressedCodePointSet(cps.toArray(), null); + } + if (highByte(cps.getMin()) == highByte(cps.getMax())) { + return convertToBitSet(cps); + } + assert size >= 1; + IntRangesBuffer ranges = compilationBuffer.getIntRangesBuffer1(); + ObjectArrayBuffer bitSets = compilationBuffer.getObjectBuffer1(); + // index of lowest range on current plane + int lowestOCP = 0; + int curPlane = highByte(cps.getHi(0)); + for (int i = 0; i < size; i++) { + if (highByte(cps.getLo(i)) != curPlane) { + processRanges(cps, ranges, bitSets, lowestOCP, i, i, curPlane); + curPlane = highByte(cps.getLo(i)); + lowestOCP = i; + } + if (highByte(cps.getHi(i)) != curPlane) { + if (lowestOCP != i) { + processRanges(cps, ranges, bitSets, lowestOCP, i, i + 1, curPlane); + } + 
curPlane = highByte(cps.getHi(i)); + lowestOCP = i; + } + } + processRanges(cps, ranges, bitSets, lowestOCP, size, size, curPlane); + if (bitSets.isEmpty()) { + return new CompressedCodePointSet(cps.toArray(), null); + } + assert ranges.rangesAreSortedAndDisjoint(); + return new CompressedCodePointSet(ranges.toArray(), bitSets.toArray(CompressedCodePointSet.NO_BITSETS)); + } + + private static CompressedCodePointSet convertToBitSet(ImmutableSortedListOfIntRanges cps) { + int highByte = highByte(cps.getMin()); + long[] bs = BitSets.createBitSetArray(cps.getMax() + 1); + for (int i = 0; i < cps.size(); i++) { + assert highByte(cps.getLo(i)) == highByte && highByte(cps.getHi(i)) == highByte; + BitSets.setRange(bs, lowByte(cps.getLo(i)), lowByte(cps.getHi(i))); + } + return new CompressedCodePointSet(new int[]{cps.getMin(), cps.getMax()}, new long[][]{bs}); + } + + private static void processRanges(ImmutableSortedListOfIntRanges cps, IntRangesBuffer ranges, ObjectArrayBuffer bitSets, int iMin, int iMax, int iMaxBS, int curPlane) { + if (iMaxBS - iMin >= TRegexOptions.TRegexRangeToBitSetConversionThreshold) { + addBitSet(cps, bitSets, ranges, iMin, iMaxBS, curPlane); + } else { + addRanges(cps, bitSets, ranges, iMin, iMax); + } + } + + private static void addRanges(ImmutableSortedListOfIntRanges cps, ObjectArrayBuffer bitSets, IntRangesBuffer ranges, int iMinArg, int iMax) { + if (iMinArg == iMax) { + return; + } + int iMin = iMinArg; + if (!bitSets.isEmpty() && bitSets.peek() != null && highByte(cps.getLo(iMin)) == highByte(ranges.getMax())) { + ranges.appendRangeAllowAdjacent(ranges.getMax() + 1, cps.getHi(iMin)); + iMin++; + } + cps.appendRangesTo(ranges, iMin, iMax); + for (int i = 0; i < iMax - iMinArg; i++) { + bitSets.add(null); + } + } + + private static void addBitSet(ImmutableSortedListOfIntRanges cps, ObjectArrayBuffer bitSets, IntRangesBuffer ranges, int iMin, int iMax, int curPlane) { + assert iMax - iMin > 1; + int curPlaneLo = curPlane << Byte.SIZE; + if 
(rangeCrossesPlanes(cps, iMin)) { + if (bitSets.isEmpty() || bitSets.peek() == null || highByte(cps.getLo(iMin)) != highByte(ranges.getMax())) { + ranges.appendRangeAllowAdjacent(cps.getLo(iMin), curPlaneLo - 1); + bitSets.add(null); + } else if (highByte(cps.getHi(iMin)) - highByte(cps.getLo(iMin)) > 1) { + ranges.appendRangeAllowAdjacent(ranges.getMax() + 1, curPlaneLo - 1); + bitSets.add(null); + } + } + long[] bs; + int bsRangeLo; + int bsRangeHi; + int iMinBS = iMin; + int iMaxBS = iMax; + if (rangeCrossesPlanes(cps, iMax - 1)) { + bs = BitSets.createBitSetArray(0x100); + iMaxBS--; + BitSets.setRange(bs, lowByte(cps.getLo(iMax - 1)), 0xff); + bsRangeHi = curPlaneLo | 0xff; + } else { + bs = BitSets.createBitSetArray(lowByte(cps.getHi(iMax - 1)) + 1); + bsRangeHi = cps.getHi(iMax - 1); + } + if (rangeCrossesPlanes(cps, iMin)) { + assert highByte(cps.getHi(iMin)) == curPlane; + iMinBS++; + BitSets.setRange(bs, 0, lowByte(cps.getHi(iMin))); + bsRangeLo = curPlaneLo; + } else { + bsRangeLo = cps.getLo(iMin); + } + for (int i = iMinBS; i < iMaxBS; i++) { + assert highByte(cps.getLo(i)) == curPlane && highByte(cps.getHi(i)) == curPlane; + BitSets.setRange(bs, lowByte(cps.getLo(i)), lowByte(cps.getHi(i))); + } + bitSets.add(bs); + ranges.appendRangeAllowAdjacent(bsRangeLo, bsRangeHi); + } + + private static boolean rangeCrossesPlanes(ImmutableSortedListOfIntRanges ranges, int i) { + return highByte(ranges.getLo(i)) != highByte(ranges.getHi(i)); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj instanceof CompressedCodePointSet) { + return Arrays.equals(ranges, ((CompressedCodePointSet) obj).ranges) && Arrays.deepEquals(bitSets, ((CompressedCodePointSet) obj).bitSets); + } + return false; + } + + @Override + public int hashCode() { + return Arrays.hashCode(ranges) * 31 + Arrays.deepHashCode(bitSets); + } + + @TruffleBoundary + @Override + public String toString() { + if (bitSets == null) { + return "[" + 
CharMatchers.rangesToString(ranges) + "]"; + } + StringBuilder sb = new StringBuilder("["); + for (int i = 0; i < ranges.length; i += 2) { + if (bitSets[i / 2] == null) { + sb.append(Range.toString(ranges[i], ranges[i + 1])); + } else { + sb.append("[range: ").append(Range.toString(ranges[i], ranges[i + 1])).append(", bs: ").append(BitSets.toString(bitSets[i / 2])).append("]"); + } + } + return sb.append("]").toString(); + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java index 1b42fd339dc8..1e1a5dcb5f0b 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java @@ -44,12 +44,23 @@ public final class Constants { public static final int MAX_CODE_POINT = Character.MAX_CODE_POINT; + // 0x00 - 0x7f + public static final CodePointSet ASCII_RANGE = CodePointSet.createNoDedup(0x00, 0x7f); + // 0x00 - 0xff public static final CodePointSet BYTE_RANGE = CodePointSet.createNoDedup(0x00, 0xff); // 0x00 - 0xffff public static final CodePointSet BMP_RANGE = CodePointSet.createNoDedup(0x00, 0xffff); + public static final CodePointSet BMP_RANGE_WITHOUT_LATIN1 = CodePointSet.createNoDedup(0x100, 0xffff); + + // range of code points that need two bytes in UTF-8 + public static final CodePointSet UTF8_TWO_BYTE_RANGE = CodePointSet.createNoDedup(0x80, 0x7ff); + + // range of code points that need three bytes in UTF-8 + public static final CodePointSet UTF8_THREE_BYTE_RANGE = CodePointSet.createNoDedup(0x800, 0xffff); + public static final CodePointSet BMP_WITHOUT_SURROGATES = CodePointSet.createNoDedup(0x0000, 0xd7ff, 0xe000, 0xffff); public static final CodePointSet ASTRAL_SYMBOLS = CodePointSet.createNoDedup(0x010000, 0x10ffff); @@ -60,6 +71,8 @@ public final class Constants { public static final CodePointSet 
TRAIL_SURROGATES = CodePointSet.createNoDedup(0xdc00, 0xdfff); + public static final CodePointSet ASTRAL_SYMBOLS_AND_LONE_SURROGATES = CodePointSet.createNoDedup(0xd800, 0xdfff, 0x010000, 0x10ffff); + // [0-9] public static final CodePointSet DIGITS = CodePointSet.createNoDedup('0', '9'); @@ -206,7 +219,7 @@ public final class Constants { 0x000e, 0x2027, 0x202a, 0x10ffff); - public static final CodePointSet DOT_ALL = CodePointSet.getFull(); + public static final CodePointSet DOT_ALL = CodePointSet.createNoDedup(0x0000, 0x10ffff); // [A-Fa-f0-9] public static final CodePointSet HEX_CHARS = CodePointSet.createNoDedup( diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfIntRanges.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfIntRanges.java index 91a6700eaa46..9bad781706d5 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfIntRanges.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfIntRanges.java @@ -43,6 +43,8 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; +import com.oracle.truffle.regex.util.BitSets; public abstract class ImmutableSortedListOfIntRanges implements ImmutableSortedListOfRanges { @@ -110,27 +112,27 @@ public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) { * Returns {@code true} iff not all values of this range set have the same high byte, but that * would be the case in the inverse of this range set. 
*/ - public boolean inverseIsSameHighByte() { + public boolean inverseIsSameHighByte(Encoding encoding) { if (isEmpty()) { return false; } - if (CharMatchers.highByte(getMin()) == CharMatchers.highByte(getMax())) { + if (BitSets.highByte(getMin()) == BitSets.highByte(getMax())) { return false; } - return matchesMinAndMax() && CharMatchers.highByte(getHi(0) + 1) == CharMatchers.highByte(getLo(size() - 1) - 1); + return matchesMinAndMax(encoding) && BitSets.highByte(getHi(0) + 1) == BitSets.highByte(getLo(size() - 1) - 1); } - protected static int[] createInverseArray(SortedListOfRanges src) { - int[] invRanges = new int[src.sizeOfInverse() * 2]; + protected static int[] createInverseArray(SortedListOfRanges src, Encoding encoding) { + int[] invRanges = new int[src.sizeOfInverse(encoding) * 2]; int i = 0; - if (src.getMin() > src.getMinValue()) { - setRange(invRanges, i++, src.getMinValue(), src.getMin() - 1); + if (src.getMin() > encoding.getMinValue()) { + setRange(invRanges, i++, encoding.getMinValue(), src.getMin() - 1); } for (int ia = 1; ia < src.size(); ia++) { setRange(invRanges, i++, src.getHi(ia - 1) + 1, src.getLo(ia) - 1); } - if (src.getMax() < src.getMaxValue()) { - setRange(invRanges, i++, src.getMax() + 1, src.getMaxValue()); + if (src.getMax() < encoding.getMaxValue()) { + setRange(invRanges, i++, src.getMax() + 1, encoding.getMaxValue()); } return invRanges; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfRanges.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfRanges.java index 56b60074bee3..1aebf226e388 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfRanges.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ImmutableSortedListOfRanges.java @@ -43,6 +43,7 @@ import java.util.Iterator; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; 
+import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; /** * Extensions of {@link SortedListOfRanges} specific to immutable implementations. Any methods of @@ -55,21 +56,16 @@ public interface ImmutableSortedListOfRanges extends SortedListOfRanges, Iterabl */ T createEmpty(); - /** - * Returns [{@link #getMinValue()} {@link #getMaxValue()}]. - */ - T createFull(); - /** * Returns an immutable equivalent of the given {@code buffer}. */ T create(RangesBuffer buffer); /** - * Returns a list containing all values of [{@link #getMinValue()} {@link #getMaxValue()}] - * not contained in this list. + * Returns a list containing all values of [{@link Encoding#getMinValue()} + * {@link Encoding#getMaxValue()}] not contained in this list. */ - T createInverse(); + T createInverse(Encoding encoding); /** * Returns a buffer from the given {@code compilationBuffer} that is compatible with this list's @@ -108,11 +104,19 @@ default T createIntersection(T o, Compil } /** - * Converts {@code target} to the intersection of this list and {@code o} and returns an - * immutable equivalent. + * Returns the intersection of this list and {@code o}, using {@code tmp} as working buffer. */ @SuppressWarnings("unchecked") default T createIntersection(T o, RangesBuffer tmp) { + if (isEmpty() || o.isEmpty()) { + return createEmpty(); + } + if (size() == 1) { + return createIntersectionSingleRange(o); + } + if (o.size() == 1) { + return o.createIntersectionSingleRange((T) this); + } tmp.clear(); for (int ia = 0; ia < size(); ia++) { int search = o.binarySearch(getLo(ia)); @@ -138,6 +142,8 @@ default T createIntersection(T o, Ranges return create(tmp); } + T createIntersectionSingleRange(T o); + /** * Returns the result of the subtraction of {@code o} from this list. Uses * {@link #getBuffer1(CompilationBuffer)}. 
@@ -222,11 +228,11 @@ default IntersectAndSubtractResult in // no intersection possible return new IntersectAndSubtractResult<>((T) this, o, createEmpty()); } - if (matchesEverything()) { - return new IntersectAndSubtractResult<>(o.createInverse(), createEmpty(), o); + if (matchesEverything(compilationBuffer.getEncoding())) { + return new IntersectAndSubtractResult<>(o.createInverse(compilationBuffer.getEncoding()), createEmpty(), o); } - if (o.matchesEverything()) { - return new IntersectAndSubtractResult<>(createEmpty(), createInverse(), (T) this); + if (o.matchesEverything(compilationBuffer.getEncoding())) { + return new IntersectAndSubtractResult<>(createEmpty(), createInverse(compilationBuffer.getEncoding()), (T) this); } if (equals(o)) { return new IntersectAndSubtractResult<>(createEmpty(), createEmpty(), (T) this); @@ -380,10 +386,10 @@ default T union(T o, CompilationBuffer c */ @SuppressWarnings("unchecked") default T union(T o, RangesBuffer target) { - if (matchesNothing() || o.matchesEverything()) { + if (matchesNothing() || o.size() == 1 && o.getMin() <= getMin() && o.getMax() >= getMax()) { return o; } - if (matchesEverything() || o.matchesNothing()) { + if (o.matchesNothing() || size() == 1 && getMin() <= o.getMin() && getMax() >= o.getMax()) { return (T) this; } SortedListOfRanges.union(this, o, target); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/SortedListOfRanges.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/SortedListOfRanges.java index aeda001c7bbb..d3c3a8e4154b 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/SortedListOfRanges.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/SortedListOfRanges.java @@ -42,6 +42,7 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.chardata.CharacterSet; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; 
/** * A storage-agnostic implementation of a sorted list of disjoint integer ranges with inclusive @@ -64,16 +65,6 @@ public interface SortedListOfRanges extends CharacterSet { */ int size(); - /** - * Returns the minimum value that may be contained in an instance of this list. - */ - int getMinValue(); - - /** - * Returns the maximum value that may be contained in an instance of this list. - */ - int getMaxValue(); - /** * Append all ranges from {@code startIndex} (inclusive) to {@code endIndex} (exclusive) to the * given {@code buffer}. The caller is responsible for not violating the target buffer's @@ -103,13 +94,13 @@ default int size(int i) { /** * Returns the number of disjoint ranges contained in the inverse (as defined by - * {@link ImmutableSortedListOfRanges#createInverse()}) of this list. + * {@link ImmutableSortedListOfRanges#createInverse(Encoding)}) of this list. */ - default int sizeOfInverse() { + default int sizeOfInverse(Encoding encoding) { if (isEmpty()) { return 1; } - return (getMin() == getMinValue() ? 0 : 1) + size() - (getMax() == getMaxValue() ? 1 : 0); + return (getMin() == encoding.getMinValue() ? 0 : 1) + size() - (getMax() == encoding.getMaxValue() ? 1 : 0); } /** @@ -132,18 +123,18 @@ default int getMax() { * Returns the smallest value contained in the inverse of this set. Must not be called on empty * or full sets. */ - default int inverseGetMin() { - assert !isEmpty() && !matchesEverything(); - return getMin() == getMinValue() ? getHi(0) + 1 : getMinValue(); + default int inverseGetMin(Encoding encoding) { + assert !isEmpty() && !matchesEverything(encoding); + return getMin() == encoding.getMinValue() ? getHi(0) + 1 : encoding.getMinValue(); } /** * Returns the largest value contained in the inverse of this set. Must not be called on empty * or full sets. */ - default int inverseGetMax() { - assert !isEmpty() && !matchesEverything(); - return getMax() == getMaxValue() ? 
getLo(size() - 1) - 1 : getMaxValue(); + default int inverseGetMax(Encoding encoding) { + assert !isEmpty() && !matchesEverything(encoding); + return getMax() == encoding.getMaxValue() ? getLo(size() - 1) - 1 : encoding.getMaxValue(); } /** @@ -633,10 +624,11 @@ default boolean matchesSingleAscii() { } /** - * Returns {@code true} iff this set contains {@link #getMinValue()} and {@link #getMaxValue()}. + * Returns {@code true} iff this set contains {@link Encoding#getMinValue()} and + * {@link Encoding#getMaxValue()}. */ - default boolean matchesMinAndMax() { - return matchesSomething() && getMin() == getMinValue() && getMax() == getMaxValue(); + default boolean matchesMinAndMax(Encoding encoding) { + return matchesSomething() && getMin() == encoding.getMinValue() && getMax() == encoding.getMaxValue(); } /** @@ -692,20 +684,20 @@ default boolean valueCountMax(int cmp) { } /** - * Returns the total number of values (from {@link #getMinValue()} to {@link #getMaxValue()}) - * not contained in this list. + * Returns the total number of values (from {@link Encoding#getMinValue()} to + * {@link Encoding#getMaxValue()}) not contained in this list. */ - default int inverseValueCount() { - return (getMaxValue() - getMinValue()) + 1 - valueCount(); + default int inverseValueCount(Encoding encoding) { + return (encoding.getMaxValue() - encoding.getMinValue()) + 1 - valueCount(); } /** - * Returns {@code true} if this list is equal to [{@link #getMinValue()} {@link #getMaxValue()} - * ]. + * Returns {@code true} if this list is equal to [{@link Encoding#getMinValue()} + * {@link Encoding#getMaxValue()} ]. 
*/ - default boolean matchesEverything() { + default boolean matchesEverything(Encoding encoding) { // ranges should be consolidated to one - return size() == 1 && getLo(0) == getMinValue() && getHi(0) == getMaxValue(); + return size() == 1 && getLo(0) == encoding.getMinValue() && getHi(0) == encoding.getMaxValue(); } default boolean equalsListOfRanges(SortedListOfRanges o) { @@ -752,11 +744,7 @@ default String defaultToString() { if (matchesSingleChar()) { return Range.toString(getLo(0), getHi(0)); } - if (matchesMinAndMax()) { - return "[^" + inverseRangesToString() + "]"; - } else { - return "[" + rangesToString() + "]"; - } + return "[" + rangesToString() + "]"; } @TruffleBoundary @@ -769,20 +757,20 @@ default String rangesToString() { } @TruffleBoundary - default String inverseRangesToString() { + default String inverseRangesToString(Encoding encoding) { StringBuilder sb = new StringBuilder(); if (matchesNothing()) { - sb.append(Range.toString(getMinValue(), getMaxValue())); + sb.append(Range.toString(encoding.getMinValue(), encoding.getMaxValue())); return sb.toString(); } - if (getLo(0) > getMinValue()) { - sb.append(Range.toString(getMinValue(), getLo(0) - 1)); + if (getLo(0) > encoding.getMinValue()) { + sb.append(Range.toString(encoding.getMinValue(), getLo(0) - 1)); } for (int ia = 1; ia < size(); ia++) { sb.append(Range.toString(getHi(ia - 1) + 1, getLo(ia) - 1)); } - if (getHi(size() - 1) < getMaxValue()) { - sb.append(Range.toString(getHi(size() - 1) + 1, getMaxValue())); + if (getHi(size() - 1) < encoding.getMaxValue()) { + sb.append(Range.toString(getHi(size() - 1) + 1, encoding.getMaxValue())); } return sb.toString(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/literal/LiteralRegexExecRootNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/literal/LiteralRegexExecRootNode.java index a379411cdcee..30dd31b22dcf 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/literal/LiteralRegexExecRootNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/literal/LiteralRegexExecRootNode.java @@ -172,6 +172,14 @@ abstract static class NonEmptyLiteralRegexExecRootNode extends LiteralRegexExecR protected String getLiteral() { return literal.toString(); } + + Object literalContent() { + return literal.content(); + } + + Object maskContent() { + return mask == null ? null : mask.content(); + } } public static final class IndexOfString extends NonEmptyLiteralRegexExecRootNode { @@ -189,7 +197,7 @@ protected String getImplName() { @Override protected RegexResult execute(Object input, int fromIndex) { - int start = indexOfStringNode.execute(input, fromIndex, inputLength(input), literal, mask); + int start = indexOfStringNode.execute(input, fromIndex, inputLength(input), literalContent(), maskContent()); if (start == -1) { return NoMatchResult.getInstance(); } @@ -212,7 +220,7 @@ protected String getImplName() { @Override protected RegexResult execute(Object input, int fromIndex) { - if (fromIndex == 0 && startsWithNode.execute(input, literal, mask)) { + if (fromIndex == 0 && startsWithNode.execute(input, literalContent(), maskContent())) { return resultFactory.createFromStart(0); } else { return NoMatchResult.getInstance(); @@ -238,7 +246,7 @@ protected String getImplName() { @Override protected RegexResult execute(Object input, int fromIndex) { int matchStart = inputLength(input) - literal.encodedLength(); - if ((sticky ? fromIndex == matchStart : fromIndex <= matchStart) && endsWithNode.execute(input, literal, mask)) { + if ((sticky ? 
fromIndex == matchStart : fromIndex <= matchStart) && endsWithNode.execute(input, literalContent(), maskContent())) { return resultFactory.createFromEnd(inputLength(input)); } else { return NoMatchResult.getInstance(); @@ -261,7 +269,7 @@ protected String getImplName() { @Override protected RegexResult execute(Object input, int fromIndex) { - if (fromIndex == 0 && equalsNode.execute(input, literal, mask)) { + if (fromIndex == 0 && equalsNode.execute(input, literalContent(), maskContent())) { return resultFactory.createFromStart(0); } else { return NoMatchResult.getInstance(); @@ -284,7 +292,7 @@ protected String getImplName() { @Override protected RegexResult execute(Object input, int fromIndex) { - if (regionMatchesNode.execute(input, fromIndex, literal, 0, literal.encodedLength(), mask)) { + if (regionMatchesNode.execute(input, fromIndex, literalContent(), 0, literal.encodedLength(), maskContent())) { return resultFactory.createFromStart(fromIndex); } else { return NoMatchResult.getInstance(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java index db0f85427840..058ec56ddcb8 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java @@ -40,11 +40,6 @@ */ package com.oracle.truffle.regex.tregex; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_AUTOMATON_SIZES; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_BAILOUT_MESSAGES; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_PHASES; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_TREGEX_COMPILATIONS; - import java.util.logging.Level; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -52,6 +47,7 @@ import 
com.oracle.truffle.api.TruffleLanguage.Env; import com.oracle.truffle.regex.CompiledRegexObject; import com.oracle.truffle.regex.RegexExecRootNode; +import com.oracle.truffle.regex.RegexFlags; import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.UnsupportedRegexException; @@ -80,8 +76,11 @@ import com.oracle.truffle.regex.tregex.parser.ast.RegexAST; import com.oracle.truffle.regex.tregex.parser.ast.visitors.ASTLaTexExportVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.PreCalcResultVisitor; +import com.oracle.truffle.regex.tregex.string.Encodings; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.tregex.util.DFAExport; import com.oracle.truffle.regex.tregex.util.DebugUtil; +import com.oracle.truffle.regex.tregex.util.Loggers; import com.oracle.truffle.regex.tregex.util.NFAExport; import com.oracle.truffle.regex.tregex.util.json.Json; @@ -97,6 +96,8 @@ public final class TRegexCompilationRequest { private final TRegexCompiler tRegexCompiler; private final RegexSource source; + private final RegexFlags flags; + private final Encoding encoding; private RegexAST ast = null; private PureNFAMap pureNFA = null; private NFA nfa = null; @@ -105,18 +106,24 @@ public final class TRegexCompilationRequest { private TRegexDFAExecutorNode executorNodeForward = null; private TRegexDFAExecutorNode executorNodeBackward = null; private TRegexDFAExecutorNode executorNodeCaptureGroups = null; - private final CompilationBuffer compilationBuffer = new CompilationBuffer(); + private final CompilationBuffer compilationBuffer; TRegexCompilationRequest(TRegexCompiler tRegexCompiler, RegexSource source) { this.tRegexCompiler = tRegexCompiler; this.source = source; + this.flags = RegexFlags.parseFlags(source.getFlags()); + this.encoding = flags.isUnicode() && !tRegexCompiler.getOptions().isUTF16ExplodeAstralSymbols() ? 
Encodings.UTF_16 : Encodings.UTF_16_RAW; + this.compilationBuffer = new CompilationBuffer(encoding); } TRegexCompilationRequest(TRegexCompiler tRegexCompiler, NFA nfa) { this.tRegexCompiler = tRegexCompiler; this.source = nfa.getAst().getSource(); + this.flags = nfa.getAst().getFlags(); + this.encoding = nfa.getAst().getEncoding(); this.ast = nfa.getAst(); this.nfa = nfa; + this.compilationBuffer = new CompilationBuffer(encoding); } public TRegexExecRootNode getRoot() { @@ -139,7 +146,7 @@ CompiledRegexObject compile() { @TruffleBoundary private RegexExecRootNode compileInternal() { - LOG_TREGEX_COMPILATIONS.finer(() -> String.format("TRegex compiling %s\n%s", DebugUtil.jsStringEscape(source.toString()), new RegexUnifier(source).getUnifiedPattern())); + Loggers.LOG_TREGEX_COMPILATIONS.finer(() -> String.format("TRegex compiling %s\n%s", DebugUtil.jsStringEscape(source.toString()), new RegexUnifier(source).getUnifiedPattern())); RegexParser regexParser = createParser(); phaseStart("Parser"); try { @@ -179,7 +186,7 @@ public TRegexBacktrackingNFAExecutorNode compileBacktrackingExecutor() { for (int i = 0; i < pureNFA.getLookArounds().size(); i++) { PureNFA lookAround = pureNFA.getLookArounds().get(i); if (pureNFA.getASTSubtree(lookAround).asLookAroundAssertion().isLiteral()) { - lookAroundExecutors[i] = new TRegexLiteralLookAroundExecutorNode(pureNFA.getASTSubtree(lookAround).asLookAroundAssertion(), ast.getEncoding(), compilationBuffer); + lookAroundExecutors[i] = new TRegexLiteralLookAroundExecutorNode(pureNFA.getASTSubtree(lookAround).asLookAroundAssertion(), compilationBuffer); } else { lookAroundExecutors[i] = new TRegexBacktrackingNFAExecutorNode(pureNFA, lookAround, lookAroundExecutors, compilationBuffer); } @@ -206,7 +213,7 @@ TRegexExecRootNode.LazyCaptureGroupRegexSearchNode compileLazyDFAExecutor(TRegex debugTraceFinder(); } catch (UnsupportedRegexException e) { phaseEnd("TraceFinder NFA Bailout"); - LOG_BAILOUT_MESSAGES.fine(() -> "TraceFinder: " + 
e.getReason() + ": " + source); + Loggers.LOG_BAILOUT_MESSAGES.fine(() -> "TraceFinder: " + e.getReason() + ": " + source); // handle with capture group aware DFA, bailout will always happen before // assigning preCalculatedResults } @@ -267,7 +274,7 @@ private void createAST() { } private RegexParser createParser() { - return new RegexParser(source, tRegexCompiler.getOptions(), compilationBuffer); + return new RegexParser(source, flags, encoding, tRegexCompiler.getOptions(), compilationBuffer); } private void createNFA() { @@ -354,24 +361,24 @@ private void debugDFA(DFAGenerator dfa, String debugDumpName) { } private static boolean shouldLogPhases() { - return LOG_PHASES.isLoggable(Level.FINER); + return Loggers.LOG_PHASES.isLoggable(Level.FINER); } private void phaseStart(String phase) { if (shouldLogPhases()) { - LOG_PHASES.finer(phase + " Start"); + Loggers.LOG_PHASES.finer(phase + " Start"); timer.start(); } } private void phaseEnd(String phase) { if (shouldLogPhases()) { - LOG_PHASES.finer(phase + " End, elapsed: " + timer.elapsedToString()); + Loggers.LOG_PHASES.finer(phase + " End, elapsed: " + timer.elapsedToString()); } } private void logAutomatonSizes(RegexExecRootNode result) { - LOG_AUTOMATON_SIZES.finer(() -> Json.obj( + Loggers.LOG_AUTOMATON_SIZES.finer(() -> Json.obj( Json.prop("pattern", source.getPattern().length() > 200 ? source.getPattern().substring(0, 200) + "..." : source.getPattern()), Json.prop("flags", source.getFlags()), Json.prop("props", ast == null ? 
new RegexProperties() : ast.getProperties()), diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/StateTransitionCanonicalizer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/StateTransitionCanonicalizer.java index 7053ab810a0e..8ae05eadb963 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/StateTransitionCanonicalizer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/StateTransitionCanonicalizer.java @@ -124,7 +124,7 @@ public TB[] run(CompilationBuffer compilationBuffer) { /** * Merges NFA transitions according to their expected character sets as returned - * {@link TransitionBuilder#getMatcherBuilder()}, in the following way:
+ * {@link TransitionBuilder#getCodePointSet()}, in the following way:
*
    *
  • The result of the algorithm is a list of transitions where no two elements have an * intersection in their respective set of expected characters. We initially define the result @@ -289,7 +289,7 @@ private TB[] mergeSameTargets(CompilationBuffer compilationBuffer) { TB last = null; for (TB tb : resultBuffer1) { if (last != null && canMerge(last, tb)) { - last.setMatcherBuilder(last.getMatcherBuilder().union(tb.getMatcherBuilder(), compilationBuffer)); + last.setMatcherBuilder(last.getCodePointSet().union(tb.getCodePointSet(), compilationBuffer)); } else { resultBuffer2.add(tb); last = tb; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/TransitionBuilder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/TransitionBuilder.java index 03ec145f9724..ccfbb94d7e7a 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/TransitionBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/TransitionBuilder.java @@ -55,7 +55,7 @@ public class TransitionBuilder, S extends AbstractState, T extends AbstractTransition> implements JsonConvertible { private final TransitionSet transitionSet; - private CodePointSet matcherBuilder; + private CodePointSet cps; public TransitionBuilder(T[] transitions, StateSet targetStateSet, CodePointSet matcherBuilder) { this(new TransitionSet<>(transitions, targetStateSet), matcherBuilder); @@ -63,7 +63,7 @@ public TransitionBuilder(T[] transitions, StateSet targetStateSet, CodePo public TransitionBuilder(TransitionSet transitionSet, CodePointSet matcherBuilder) { this.transitionSet = transitionSet; - this.matcherBuilder = matcherBuilder; + this.cps = matcherBuilder; } public TransitionSet getTransitionSet() { @@ -73,17 +73,17 @@ public TransitionSet getTransitionSet() { /** * Represents the character set matched by this transition fragment. 
*/ - public CodePointSet getMatcherBuilder() { - return matcherBuilder; + public CodePointSet getCodePointSet() { + return cps; } - public void setMatcherBuilder(CodePointSet matcherBuilder) { - this.matcherBuilder = matcherBuilder; + public void setMatcherBuilder(CodePointSet cps) { + this.cps = cps; } @TruffleBoundary @Override public JsonValue toJson() { - return Json.obj(Json.prop("matcherBuilder", getMatcherBuilder())); + return Json.obj(Json.prop("matcherBuilder", getCodePointSet())); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CharRangesBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CharRangesBuffer.java deleted file mode 100644 index 6710e39b60ea..000000000000 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CharRangesBuffer.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * The Universal Permissive License (UPL), Version 1.0 - * - * Subject to the condition set forth below, permission is hereby granted to any - * person obtaining a copy of this software, associated documentation and/or - * data (collectively the "Software"), free of charge and under any and all - * copyright rights in the Software, and any and all patent rights owned or - * freely licensable by each licensor hereunder covering either (i) the - * unmodified Software as contributed to or provided by such licensor, or (ii) - * the Larger Works (as defined below), to deal in both - * - * (a) the Software, and - * - * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if - * one is included with the Software each a "Larger Work" to which the Software - * is contributed by such licensors), - * - * without restriction, including without limitation the rights to copy, create - * derivative works of, display, perform, and distribute the Software and make, - * use, sell, offer for sale, import, export, have made, and have sold the - * Software and the Larger Work(s), and to sublicense the foregoing rights on - * either these or other terms. - * - * This license is subject to the following condition: - * - * The above copyright notice and either this complete permission notice or at a - * minimum a reference to the UPL must be included in all copies or substantial - * portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.oracle.truffle.regex.tregex.buffer; - -import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.regex.charset.RangesBuffer; - -/** - * Extension of {@link CharArrayBuffer} that adds convenience functions for arrays of character - * ranges in the form: - * - *
    - * [
    - *     inclusive lower bound of range 1, inclusive upper bound of range 1,
    - *     inclusive lower bound of range 2, inclusive upper bound of range 2,
    - *     inclusive lower bound of range 3, inclusive upper bound of range 3,
    - *     ...
    - * ]
    - * 
    - */ -public class CharRangesBuffer extends CharArrayBuffer implements RangesBuffer { - - public CharRangesBuffer() { - this(16); - } - - public CharRangesBuffer(int initialSize) { - super(initialSize); - } - - @Override - public int getMinValue() { - return Character.MIN_VALUE; - } - - @Override - public int getMaxValue() { - return Character.MAX_VALUE; - } - - @Override - public int getLo(int i) { - return buf[i * 2]; - } - - @Override - public int getHi(int i) { - return buf[i * 2 + 1]; - } - - @Override - public int size() { - return length() / 2; - } - - @Override - public void appendRange(int lo, int hi) { - assert isEmpty() || leftOf(size() - 1, lo, hi) && !adjacent(size() - 1, lo, hi); - add((char) lo); - add((char) hi); - } - - @Override - public void insertRange(int index, int lo, int hi) { - assert index >= 0 && index < size(); - assert index == 0 || leftOf(index - 1, lo, hi) && !adjacent(index - 1, lo, hi); - assert rightOf(index, lo, hi) && !adjacent(index, lo, hi); - ensureCapacity(length + 2); - int i = index * 2; - System.arraycopy(buf, i, buf, i + 2, length - i); - buf[i] = (char) lo; - buf[i + 1] = (char) hi; - length += 2; - } - - @Override - public void replaceRanges(int fromIndex, int toIndex, int lo, int hi) { - assert fromIndex >= 0 && fromIndex < toIndex && toIndex >= 0 && toIndex <= size(); - assert fromIndex == 0 || leftOf(fromIndex - 1, lo, hi) && !adjacent(fromIndex - 1, lo, hi); - assert toIndex == size() || rightOf(toIndex, lo, hi) && !adjacent(toIndex, lo, hi); - buf[fromIndex * 2] = (char) lo; - buf[fromIndex * 2 + 1] = (char) hi; - if (toIndex < size()) { - System.arraycopy(buf, toIndex * 2, buf, fromIndex * 2 + 2, length - (toIndex * 2)); - } - length -= (toIndex - (fromIndex + 1)) * 2; - } - - @Override - public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) { - assert buffer instanceof CharRangesBuffer; - int bulkLength = (endIndex - startIndex) * 2; - if (bulkLength == 0) { - return; - } - 
CharRangesBuffer o = (CharRangesBuffer) buffer; - int newSize = o.length() + bulkLength; - o.ensureCapacity(newSize); - assert o.isEmpty() || rightOf(startIndex, o, o.size() - 1); - System.arraycopy(buf, startIndex * 2, o.getBuffer(), o.length(), bulkLength); - o.setLength(newSize); - } - - @SuppressWarnings("unchecked") - @Override - public CharRangesBuffer create() { - return new CharRangesBuffer(buf.length); - } - - @TruffleBoundary - @Override - public String toString() { - return defaultToString(); - } -} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CompilationBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CompilationBuffer.java index eddecd377e7d..e72fa6f4018e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CompilationBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/CompilationBuffer.java @@ -43,6 +43,7 @@ import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; import com.oracle.truffle.regex.tregex.TRegexCompiler; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.util.CompilationFinalBitSet; /** @@ -56,14 +57,15 @@ * @see ObjectArrayBuffer * @see ByteArrayBuffer * @see ShortArrayBuffer - * @see CharRangesBuffer */ public class CompilationBuffer { + private final Encoding encoding; private ObjectArrayBuffer objectBuffer1; private ObjectArrayBuffer objectBuffer2; private ByteArrayBuffer byteArrayBuffer; - private ShortArrayBuffer shortArrayBuffer; + private ShortArrayBuffer shortArrayBuffer1; + private ShortArrayBuffer shortArrayBuffer2; private IntRangesBuffer intRangesBuffer1; private IntRangesBuffer intRangesBuffer2; private IntRangesBuffer intRangesBuffer3; @@ -71,6 +73,14 @@ public class CompilationBuffer { private CodePointSetAccumulator codePointSetAccumulator2; private 
CompilationFinalBitSet byteSizeBitSet; + public CompilationBuffer(Encoding encoding) { + this.encoding = encoding; + } + + public Encoding getEncoding() { + return encoding; + } + @SuppressWarnings("unchecked") public ObjectArrayBuffer getObjectBuffer1() { if (objectBuffer1 == null) { @@ -97,12 +107,20 @@ public ByteArrayBuffer getByteArrayBuffer() { return byteArrayBuffer; } - public ShortArrayBuffer getShortArrayBuffer() { - if (shortArrayBuffer == null) { - shortArrayBuffer = new ShortArrayBuffer(); + public ShortArrayBuffer getShortArrayBuffer1() { + if (shortArrayBuffer1 == null) { + shortArrayBuffer1 = new ShortArrayBuffer(); + } + shortArrayBuffer1.clear(); + return shortArrayBuffer1; + } + + public ShortArrayBuffer getShortArrayBuffer2() { + if (shortArrayBuffer2 == null) { + shortArrayBuffer2 = new ShortArrayBuffer(); } - shortArrayBuffer.clear(); - return shortArrayBuffer; + shortArrayBuffer2.clear(); + return shortArrayBuffer2; } public IntRangesBuffer getIntRangesBuffer1() { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntArrayBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntArrayBuffer.java index ed568b697682..95d2f11ad9a1 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntArrayBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntArrayBuffer.java @@ -109,6 +109,11 @@ public int get(int index) { return buf[index]; } + public void inc(int index) { + assert index < length; + buf[index]++; + } + public void set(int index, int value) { assert index < length; buf[index] = value; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntRangesBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntRangesBuffer.java index f00f5be65920..933e09a9e232 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntRangesBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/IntRangesBuffer.java @@ -44,7 +44,6 @@ import java.util.Iterator; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.Range; import com.oracle.truffle.regex.charset.RangesBuffer; @@ -71,16 +70,6 @@ public IntRangesBuffer(int initialSize) { super(initialSize); } - @Override - public int getMinValue() { - return CodePointSet.MIN_VALUE; - } - - @Override - public int getMaxValue() { - return CodePointSet.MAX_VALUE; - } - @Override public int getLo(int i) { return buf[i * 2]; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java index 7d835b74b929..a6ed70f23bdc 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java @@ -93,6 +93,10 @@ public T get(int i) { return (T) buf[i]; } + public void set(int i, T obj) { + buf[i] = obj; + } + public void add(T o) { if (length == buf.length) { grow(length * 2); @@ -126,6 +130,13 @@ public Object peek() { return buf[length - 1]; } + public ObjectArrayBuffer asFixedSizeArray(int size) { + ensureCapacity(size); + Arrays.fill(buf, null); + length = size; + return this; + } + @SuppressWarnings("unchecked") public void sort(Comparator comparator) { Arrays.sort((T[]) buf, 0, length, comparator); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFACaptureGroupTransitionBuilder.java 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFACaptureGroupTransitionBuilder.java index 6fc562b3f0cf..302ab40cc6eb 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFACaptureGroupTransitionBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFACaptureGroupTransitionBuilder.java @@ -80,7 +80,7 @@ public class DFACaptureGroupTransitionBuilder extends DFAStateTransitionBuilder @Override public DFAStateTransitionBuilder createNodeSplitCopy() { - return new DFACaptureGroupTransitionBuilder(getMatcherBuilder(), getTransitionSet(), dfaGen); + return new DFACaptureGroupTransitionBuilder(getCodePointSet(), getTransitionSet(), dfaGen); } public void setLazyTransition(DFACaptureGroupLazyTransition lazyTransition) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java index 09820b500fa0..af009fac3120 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java @@ -46,7 +46,6 @@ import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; @@ -56,11 +55,10 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.RegexOptions; import com.oracle.truffle.regex.UnsupportedRegexException; -import com.oracle.truffle.regex.charset.CharMatchers; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; +import com.oracle.truffle.regex.charset.CompressedCodePointSet; import com.oracle.truffle.regex.charset.Constants; -import com.oracle.truffle.regex.charset.Range; import 
com.oracle.truffle.regex.result.PreCalculatedResultFactory; import com.oracle.truffle.regex.tregex.TRegexCompilationRequest; import com.oracle.truffle.regex.tregex.TRegexOptions; @@ -71,8 +69,6 @@ import com.oracle.truffle.regex.tregex.buffer.IntArrayBuffer; import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer; import com.oracle.truffle.regex.tregex.buffer.ShortArrayBuffer; -import com.oracle.truffle.regex.tregex.matchers.AnyMatcher; -import com.oracle.truffle.regex.tregex.matchers.CharMatcher; import com.oracle.truffle.regex.tregex.nfa.NFA; import com.oracle.truffle.regex.tregex.nfa.NFAState; import com.oracle.truffle.regex.tregex.nfa.NFAStateTransition; @@ -88,6 +84,7 @@ import com.oracle.truffle.regex.tregex.nodes.dfa.DFASimpleCG; import com.oracle.truffle.regex.tregex.nodes.dfa.DFASimpleCGTransition; import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexDFAExecutorDebugRecorder; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexDFAExecutorNode; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexDFAExecutorProperties; @@ -102,10 +99,12 @@ import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; import com.oracle.truffle.regex.tregex.parser.ast.Sequence; import com.oracle.truffle.regex.tregex.parser.ast.visitors.AddToSetVisitor; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.tregex.util.MathUtil; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; import com.oracle.truffle.regex.tregex.util.json.JsonValue; +import com.oracle.truffle.regex.util.BitSets; import com.oracle.truffle.regex.util.CompilationFinalBitSet; public final class DFAGenerator implements JsonConvertible { @@ -144,6 +143,8 @@ public final class DFAGenerator implements JsonConvertible { private List bfsTraversalNext; private 
EconomicMap stateReplacements; + private final Matchers.Builder matchersBuilder; + public DFAGenerator(TRegexCompilationRequest compilationReqest, NFA nfa, TRegexDFAExecutorProperties executorProps, CompilationBuffer compilationBuffer, RegexOptions engineOptions) { this.compilationReqest = compilationReqest; this.nfa = nfa; @@ -163,6 +164,7 @@ public DFAGenerator(TRegexCompilationRequest compilationReqest, NFA nfa, TRegexD } assert !nfa.isDead(); this.canonicalizer = new DFATransitionCanonicalizer(this); + this.matchersBuilder = nfa.getAst().getEncoding().createMatchersBuilder(); } public NFA getNfa() { @@ -173,6 +175,10 @@ public DFAStateNodeBuilder[] getEntryStates() { return entryStates; } + private DFAStateNodeBuilder getUnanchoredInitialState() { + return entryStates[nfa.getAnchoredEntry().length]; + } + public Map getStateMap() { return stateMap; } @@ -201,6 +207,10 @@ public RegexOptions getEngineOptions() { return engineOptions; } + private Encoding getEncoding() { + return nfa.getAst().getEncoding(); + } + private DFAStateNodeBuilder[] getStateIndexMap() { if (stateIndexMap == null) { createStateIndexMap(nextID); @@ -408,7 +418,7 @@ private void expandState(DFAStateNodeBuilder state) { } } DFAStateTransitionBuilder[] transitions = canonicalizer.run(compilationBuffer); - Arrays.sort(transitions, Comparator.comparing(TransitionBuilder::getMatcherBuilder)); + Arrays.sort(transitions, Comparator.comparing(TransitionBuilder::getCodePointSet)); for (DFAStateTransitionBuilder transition : transitions) { assert !transition.getTransitionSet().isEmpty(); transition.setId(transitionIDCounter.inc()); @@ -606,7 +616,7 @@ private void optimizeDFA() { // find the literal's beginning and end in the DFA DFAStateNodeBuilder literalFirstDFAState = null; DFAStateNodeBuilder literalLastDFAState = null; - DFAStateNodeBuilder unanchoredInitialState = entryStates[nfa.getAnchoredEntry().length]; + DFAStateNodeBuilder unanchoredInitialState = getUnanchoredInitialState(); 
CompilationFinalBitSet visited = new CompilationFinalBitSet(nextID); visited.set(unanchoredInitialState.getId()); bfsTraversalCur.clear(); @@ -689,10 +699,10 @@ private void optimizeDFA() { } else if (newTransitions == null) { newTransitions = compilationBuffer.getObjectBuffer1(); newTransitions.addAll(s.getSuccessors(), 0, i); - acc.addSet(mergedTransition.getMatcherBuilder()); - acc.addSet(t.getMatcherBuilder()); + acc.addSet(mergedTransition.getCodePointSet()); + acc.addSet(t.getCodePointSet()); } else { - acc.addSet(t.getMatcherBuilder()); + acc.addSet(t.getCodePointSet()); } } else if (newTransitions != null) { newTransitions.add(t); @@ -703,7 +713,7 @@ private void optimizeDFA() { if (newTransitions != null && mergedTransition != null) { mergedTransition.setMatcherBuilder(acc.toCodePointSet()); s.setSuccessors(newTransitions.toArray(new DFAStateTransitionBuilder[newTransitions.length()])); - Arrays.sort(s.getSuccessors(), Comparator.comparing(TransitionBuilder::getMatcherBuilder)); + Arrays.sort(s.getSuccessors(), Comparator.comparing(TransitionBuilder::getCodePointSet)); } } } @@ -780,8 +790,10 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { } } } + boolean utf16MustDecode = false; DFAAbstractStateNode[] ret = new DFAAbstractStateNode[stateMap.values().size() + 1]; for (DFAStateNodeBuilder s : stateMap.values()) { + matchersBuilder.reset(s.getSuccessors().length); assert s.getId() <= Short.MAX_VALUE; short id = (short) s.getId(); DFAAbstractStateNode replacement = getReplacement(id); @@ -789,23 +801,24 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { ret[id] = replacement; continue; } - CharMatcher[] matchers = (s.getSuccessors().length > 0) ? new CharMatcher[s.getSuccessors().length] : CharMatcher.EMPTY; - DFASimpleCGTransition[] simpleCGTransitions = doSimpleCG ? new DFASimpleCGTransition[matchers.length] : null; + DFASimpleCGTransition[] simpleCGTransitions = doSimpleCG ? 
new DFASimpleCGTransition[s.getSuccessors().length] : null; int nRanges = 0; int estimatedTransitionsCost = 0; boolean coversCharSpace = s.coversFullCharSpace(compilationBuffer); - for (int i = 0; i < matchers.length; i++) { + for (int i = 0; i < s.getSuccessors().length; i++) { DFAStateTransitionBuilder t = s.getSuccessors()[i]; - CodePointSet matcherBuilder = t.getMatcherBuilder(); - if (i == matchers.length - 1 && (coversCharSpace || (pruneUnambiguousPaths && !s.isFinalStateSuccessor()))) { + CodePointSet cps = t.getCodePointSet(); + utf16MustDecode |= Constants.ASTRAL_SYMBOLS_AND_LONE_SURROGATES.intersects(cps); + if (i == s.getSuccessors().length - 1 && (coversCharSpace || (pruneUnambiguousPaths && !s.isFinalStateSuccessor()))) { // replace the last matcher with an AnyMatcher, since it must always cover the // remaining input space - matchers[i] = AnyMatcher.create(); + matchersBuilder.setNoMatchSuccessor((short) i); } else { - nRanges += matcherBuilder.size(); - matchers[i] = CharMatchers.createMatcher(matcherBuilder, nfa.getAst().getEncoding(), compilationBuffer); + nRanges += cps.size(); + getEncoding().createMatcher(matchersBuilder, i, cps, compilationBuffer); + } - estimatedTransitionsCost += matchers[i].estimatedCost(); + estimatedTransitionsCost += matchersBuilder.estimatedCost(i); if (doSimpleCG) { assert t.getTransitionSet().size() <= 2; @@ -814,22 +827,26 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { } } + Matchers matchers = null; // Very conservative heuristic for whether we should use AllTransitionsInOneTreeMatcher. // TODO: Potential benefits of this should be further explored. 
AllTransitionsInOneTreeMatcher allTransitionsInOneTreeMatcher = null; boolean useTreeTransitionMatcher = nRanges > 1 && MathUtil.log2ceil(nRanges + 2) * 8 < estimatedTransitionsCost; if (useTreeTransitionMatcher) { - if (!getOptions().isRegressionTestMode()) { + if (getOptions().isRegressionTestMode()) { // in regression test mode, we compare results of regular matchers and // AllTransitionsInOneTreeMatcher - matchers = null; + matchers = getEncoding().toMatchers(matchersBuilder); } - allTransitionsInOneTreeMatcher = createAllTransitionsInOneTreeMatcher(s); + allTransitionsInOneTreeMatcher = createAllTransitionsInOneTreeMatcher(s, coversCharSpace); + } else { + matchers = getEncoding().toMatchers(matchersBuilder); } short[] successors = s.getNumberOfSuccessors() > 0 ? new short[s.getNumberOfSuccessors()] : EMPTY_SHORT_ARRAY; short[] cgTransitions = null; short[] cgPrecedingTransitions = null; + DFAStateNode.LoopOptimizationNode loopOptimizationNode = null; if (isGenericCG()) { cgTransitions = new short[s.getSuccessors().length]; DFAStateTransitionBuilder[] precedingTransitions = s.getPredecessors(); @@ -839,22 +856,14 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { cgPrecedingTransitions[i] = ((DFACaptureGroupTransitionBuilder) precedingTransitions[i]).toLazyTransition(compilationBuffer).getId(); } } - char[] indexOfChars = null; short loopToSelf = -1; for (int i = 0; i < successors.length - (s.hasBackwardPrefixState() ? 1 : 0); i++) { successors[i] = (short) s.getSuccessors()[i].getTarget().getId(); if (successors[i] == id) { loopToSelf = (short) i; - CodePointSet loopMB = s.getSuccessors()[i].getMatcherBuilder(); - // TODO: specialized for UTF-16. Generalize for other encodings! 
- if (coversCharSpace && !loopMB.matchesEverything() && loopMB.inverseValueCount() <= 4 && loopMB.inverseGetMax() <= 0xffff) { - indexOfChars = loopMB.inverseToCharArray(); - for (char c : indexOfChars) { - if (Constants.SURROGATES.contains(c)) { - indexOfChars = null; - break; - } - } + CodePointSet loopMB = s.getSuccessors()[i].getCodePointSet(); + if (coversCharSpace && !loopMB.matchesEverything(getEncoding()) && loopMB.inverseValueCount(getEncoding()) <= 4) { + loopOptimizationNode = getEncoding().extractLoopOptNode(loopMB); } } assert successors[i] >= 0 && successors[i] < ret.length; @@ -867,11 +876,7 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { if (s.hasBackwardPrefixState()) { successors[successors.length - 1] = s.getBackwardPrefixState(); } - byte flags = DFAStateNode.buildFlags(s.isUnAnchoredFinalState(), s.isAnchoredFinalState(), s.hasBackwardPrefixState()); - DFAStateNode.LoopOptimizationNode loopOptimizationNode = null; - if (loopToSelf != -1) { - loopOptimizationNode = DFAStateNode.buildLoopOptimizationNode(loopToSelf, indexOfChars); - } + byte flags = DFAStateNode.buildFlags(s.isUnAnchoredFinalState(), s.isAnchoredFinalState(), s.hasBackwardPrefixState(), utf16MustDecode); DFASimpleCG simpleCG = null; if (doSimpleCG) { simpleCG = DFASimpleCG.create(simpleCGTransitions, @@ -880,16 +885,16 @@ private DFAAbstractStateNode[] createDFAExecutorStates() { } DFAStateNode stateNode; if (isGenericCG()) { - stateNode = new CGTrackingDFAStateNode(id, flags, loopOptimizationNode, successors, matchers, allTransitionsInOneTreeMatcher, cgTransitions, cgPrecedingTransitions, + stateNode = new CGTrackingDFAStateNode(id, flags, loopToSelf, loopOptimizationNode, successors, matchers, allTransitionsInOneTreeMatcher, cgTransitions, cgPrecedingTransitions, createCGFinalTransition(s.getAnchoredFinalStateTransition()), createCGFinalTransition(s.getUnAnchoredFinalStateTransition())); } else if (nfa.isTraceFinderNFA()) { - stateNode = new 
TraceFinderDFAStateNode(id, flags, loopOptimizationNode, successors, matchers, + stateNode = new TraceFinderDFAStateNode(id, flags, loopToSelf, loopOptimizationNode, successors, matchers, allTransitionsInOneTreeMatcher, s.getPreCalculatedUnAnchoredResult(), s.getPreCalculatedAnchoredResult()); } else if (isForward()) { - stateNode = new DFAStateNode(id, flags, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); + stateNode = new DFAStateNode(id, flags, loopToSelf, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); } else { - stateNode = new BackwardDFAStateNode(id, flags, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); + stateNode = new BackwardDFAStateNode(id, flags, loopToSelf, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); } ret[id] = stateNode; } @@ -900,49 +905,96 @@ private DFASimpleCGTransition createSimpleCGTransition(NFAStateTransition nfaTra return DFASimpleCGTransition.create(nfaTransition, isForward() && nfaTransition != null && nfaTransition.getSource() == nfa.getInitialLoopBackTransition().getSource()); } - private AllTransitionsInOneTreeMatcher createAllTransitionsInOneTreeMatcher(DFAStateNodeBuilder state) { + private AllTransitionsInOneTreeMatcher createAllTransitionsInOneTreeMatcher(DFAStateNodeBuilder state, boolean coversCharSpace) { DFAStateTransitionBuilder[] transitions = state.getSuccessors(); - IntArrayBuffer sortedRangesBuf = compilationBuffer.getIntRangesBuffer1(); - ShortArrayBuffer rangeTreeSuccessorsBuf = compilationBuffer.getShortArrayBuffer(); - @SuppressWarnings("unchecked") - Iterator[] iterators = new Iterator[transitions.length]; - Range[] curRanges = new Range[transitions.length]; - boolean rangesLeft = false; - for (int i = 0; i < transitions.length; i++) { - iterators[i] = transitions[i].getMatcherBuilder().iterator(); - if (iterators[i].hasNext()) { - curRanges[i] = 
iterators[i].next(); - rangesLeft = true; - } + CompressedCodePointSet[] ccpss = new CompressedCodePointSet[coversCharSpace ? transitions.length - 1 : transitions.length]; + for (int i = 0; i < ccpss.length; i++) { + ccpss[i] = CompressedCodePointSet.create(transitions[i].getCodePointSet(), compilationBuffer); } + IntArrayBuffer ranges = compilationBuffer.getIntRangesBuffer1(); + IntArrayBuffer iterators = compilationBuffer.getIntRangesBuffer2().asFixedSizeArray(ccpss.length, 0); + IntArrayBuffer byteRanges = compilationBuffer.getIntRangesBuffer3(); + ShortArrayBuffer successors = compilationBuffer.getShortArrayBuffer1(); + ShortArrayBuffer byteSuccessors = compilationBuffer.getShortArrayBuffer2(); + ObjectArrayBuffer byteBitSets = compilationBuffer.getObjectBuffer1(); + ObjectArrayBuffer byteMatchers = compilationBuffer.getObjectBuffer2(); + short noMatchSuccessor = (short) (coversCharSpace ? transitions.length - 1 : -1); int lastHi = 0; - while (rangesLeft) { + while (true) { int minLo = Integer.MAX_VALUE; - int minMb = -1; - for (int i = 0; i < transitions.length; i++) { - if (curRanges[i] != null && curRanges[i].lo < minLo) { - minLo = curRanges[i].lo; - minMb = i; + int minCPS = -1; + for (int i = 0; i < ccpss.length; i++) { + if (iterators.get(i) < ccpss[i].size() && ccpss[i].getLo(iterators.get(i)) < minLo) { + minLo = ccpss[i].getLo(iterators.get(i)); + minCPS = i; } } - if (minMb == -1) { + if (minCPS == -1) { break; } if (minLo != lastHi) { - rangeTreeSuccessorsBuf.add((short) -1); - sortedRangesBuf.add(minLo); + successors.add(noMatchSuccessor); + ranges.add(minLo); + } + lastHi = ccpss[minCPS].getHi(iterators.get(minCPS)) + 1; + + if (ccpss[minCPS].hasBitSet(iterators.get(minCPS))) { + byteRanges.clear(); + byteSuccessors.clear(); + byteBitSets.clear(); + for (int i = 0; i < ccpss.length; i++) { + if (iterators.get(i) < ccpss[i].size() && ccpss[i].hasBitSet(iterators.get(i)) && BitSets.highByte(ccpss[i].getLo(iterators.get(i))) == 
BitSets.highByte(lastHi - 1)) { + byteBitSets.add(ccpss[i].getBitSet(iterators.get(i))); + lastHi = Math.max(lastHi, ccpss[i].getHi(iterators.get(i)) + 1); + iterators.inc(i); + byteSuccessors.add((short) i); + } + } + int byteLastHi = minLo; + while (true) { + int byteMinLo = lastHi; + int byteMinCPS = -1; + for (int i = 0; i < ccpss.length; i++) { + if (iterators.get(i) < ccpss[i].size() && ccpss[i].getLo(iterators.get(i)) < byteMinLo) { + assert !ccpss[i].hasBitSet(iterators.get(i)); + assert ccpss[i].getHi(iterators.get(i)) < lastHi; + byteMinLo = ccpss[i].getLo(iterators.get(i)); + byteMinCPS = i; + } + } + if (byteMinCPS == -1) { + break; + } + if (byteMinLo != byteLastHi) { + byteSuccessors.add(noMatchSuccessor); + byteRanges.add(byteMinLo); + } + byteSuccessors.add((short) byteMinCPS); + byteLastHi = ccpss[byteMinCPS].getHi(iterators.get(byteMinCPS)) + 1; + if (byteLastHi < lastHi) { + byteRanges.add(byteLastHi); + } + iterators.inc(byteMinCPS); + } + if (byteLastHi != lastHi) { + byteSuccessors.add(noMatchSuccessor); + } + successors.add((short) ((byteMatchers.length() + 2) * -1)); + byteMatchers.add(new AllTransitionsInOneTreeMatcher.AllTransitionsInOneTreeLeafMatcher( + byteBitSets.toArray(new long[byteBitSets.length()][]), byteSuccessors.toArray(), byteRanges.toArray())); + } else { + successors.add((short) minCPS); + iterators.inc(minCPS); } - rangeTreeSuccessorsBuf.add((short) minMb); - lastHi = curRanges[minMb].hi + 1; - if (lastHi <= Constants.MAX_CODE_POINT) { - sortedRangesBuf.add(lastHi); + if (lastHi <= getEncoding().getMaxValue()) { + ranges.add(lastHi); } - curRanges[minMb] = iterators[minMb].hasNext() ? 
iterators[minMb].next() : null; } - if (lastHi != Constants.MAX_CODE_POINT + 1) { - rangeTreeSuccessorsBuf.add((short) -1); + if (lastHi != getEncoding().getMaxValue() + 1) { + successors.add(noMatchSuccessor); } - return new AllTransitionsInOneTreeMatcher(sortedRangesBuf.toArray(), rangeTreeSuccessorsBuf.toArray()); + return new AllTransitionsInOneTreeMatcher(ranges.toArray(), successors.toArray(), + byteMatchers.toArray(new AllTransitionsInOneTreeMatcher.AllTransitionsInOneTreeLeafMatcher[byteMatchers.length()])); } private void registerCGTransition(DFACaptureGroupLazyTransition cgTransition) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java index 6106ff20eb54..708b3cfb4d05 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java @@ -52,6 +52,7 @@ import com.oracle.truffle.regex.tregex.nfa.NFAState; import com.oracle.truffle.regex.tregex.nfa.NFAStateTransition; import com.oracle.truffle.regex.tregex.nodes.dfa.TraceFinderDFAStateNode; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.tregex.util.DebugUtil; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; @@ -180,23 +181,22 @@ protected DFAStateTransitionBuilder[] createTransitionsArray(int length) { } /** - * Returns {@code true} iff the union of the - * {@link DFAStateTransitionBuilder#getMatcherBuilder()} of all transitions in this state is - * equal to {@link CodePointSet#getFull()}. + * Returns {@code true} iff the union of the {@link DFAStateTransitionBuilder#getCodePointSet()} + * of all transitions in this state is equal to {@link Encoding#getFullSet()}. 
*/ public boolean coversFullCharSpace(CompilationBuffer compilationBuffer) { IntArrayBuffer indicesBuf = compilationBuffer.getIntRangesBuffer1(); indicesBuf.ensureCapacity(getSuccessors().length); int[] indices = indicesBuf.getBuffer(); Arrays.fill(indices, 0, getSuccessors().length, 0); - int nextLo = CodePointSet.MIN_VALUE; + int nextLo = compilationBuffer.getEncoding().getMinValue(); while (true) { int i = findNextLo(indices, nextLo); if (i < 0) { return false; } - CodePointSet ranges = getSuccessors()[i].getMatcherBuilder(); - if (ranges.getHi(indices[i]) == CodePointSet.MAX_VALUE) { + CodePointSet ranges = getSuccessors()[i].getCodePointSet(); + if (ranges.getHi(indices[i]) == compilationBuffer.getEncoding().getMaxValue()) { return true; } nextLo = ranges.getHi(indices[i]) + 1; @@ -206,7 +206,7 @@ public boolean coversFullCharSpace(CompilationBuffer compilationBuffer) { private int findNextLo(int[] indices, int findLo) { for (int i = 0; i < getSuccessors().length; i++) { - CodePointSet ranges = getSuccessors()[i].getMatcherBuilder(); + CodePointSet ranges = getSuccessors()[i].getCodePointSet(); if (indices[i] == ranges.size()) { continue; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateTransitionBuilder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateTransitionBuilder.java index 76cad7f07359..188247fee70d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateTransitionBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateTransitionBuilder.java @@ -71,7 +71,7 @@ public DFAStateTransitionBuilder(TransitionSet " + target; + return source + " -" + getCodePointSet() + "-> " + target; } @TruffleBoundary @@ -120,7 +120,7 @@ public JsonValue toJson() { return Json.obj(Json.prop("id", id), Json.prop("source", source.getId()), Json.prop("target", target.getId()), - 
Json.prop("matcherBuilder", getMatcherBuilder().toString()), + Json.prop("matcherBuilder", getCodePointSet().toString()), Json.prop("nfaTransitions", nfaTransitions)); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/BitSetMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/BitSetMatcher.java index c45ad2e5ec82..699c17bff44e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/BitSetMatcher.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/BitSetMatcher.java @@ -41,9 +41,10 @@ package com.oracle.truffle.regex.tregex.matchers; import com.oracle.truffle.api.CompilerDirectives; +import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.regex.tregex.util.DebugUtil; -import com.oracle.truffle.regex.util.CompilationFinalBitSet; +import com.oracle.truffle.regex.util.BitSets; /** * Matcher that matches multiple characters with a common high byte using a bit set.
    @@ -54,9 +55,9 @@ public abstract class BitSetMatcher extends InvertibleCharMatcher { private final int highByte; - private final CompilationFinalBitSet bitSet; + @CompilationFinal(dimensions = 1) private final long[] bitSet; - BitSetMatcher(boolean invert, int highByte, CompilationFinalBitSet bitSet) { + BitSetMatcher(boolean invert, int highByte, long[] bitSet) { super(invert); assert highByte != 0 : "use NullHighByteBitSetMatcher instead!"; this.highByte = highByte; @@ -71,20 +72,20 @@ public abstract class BitSetMatcher extends InvertibleCharMatcher { * @param bitSet the bit set to match the low byte of the characters to match. * @return a new {@link BitSetMatcher} or a {@link NullHighByteBitSetMatcher}. */ - public static InvertibleCharMatcher create(boolean invert, int highByte, CompilationFinalBitSet bitSet) { + public static InvertibleCharMatcher create(boolean invert, int highByte, long[] bitSet) { if (highByte == 0) { return NullHighByteBitSetMatcher.create(invert, bitSet); } return BitSetMatcherNodeGen.create(invert, highByte, bitSet); } - public CompilationFinalBitSet getBitSet() { + public long[] getBitSet() { return bitSet; } @Specialization public boolean match(int c, boolean compactString) { - return result(!compactString && highByte(c) == highByte && bitSet.get(lowByte(c))); + return result(!compactString && highByte(c) == highByte && BitSets.get(bitSet, lowByte(c))); } @Override @@ -96,6 +97,6 @@ public int estimatedCost() { @Override @CompilerDirectives.TruffleBoundary public String toString() { - return modifiersToString() + "{hi " + DebugUtil.charToString(highByte) + " lo " + bitSet + "}"; + return modifiersToString() + "{hi " + DebugUtil.charToString(highByte) + " lo " + BitSets.toString(bitSet) + "}"; } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/HybridBitSetMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/HybridBitSetMatcher.java index 
2da739151cb3..c0e984fd21ae 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/HybridBitSetMatcher.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/HybridBitSetMatcher.java @@ -45,10 +45,11 @@ import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.api.dsl.Specialization; +import com.oracle.truffle.regex.charset.CompressedCodePointSet; import com.oracle.truffle.regex.charset.Range; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.util.MathUtil; -import com.oracle.truffle.regex.util.CompilationFinalBitSet; +import com.oracle.truffle.regex.util.BitSets; /** * Character matcher that compiles to a binary search in a sorted list of ranges, like @@ -68,11 +69,13 @@ * - ranges: [0x02-0x06, 0x1000, 0x1020-0x1030] * - bit-sets: [[0x02, 0x04, 0x06], null, null ] * + * + * @see CompressedCodePointSet */ public abstract class HybridBitSetMatcher extends InvertibleCharMatcher { @CompilationFinal(dimensions = 1) private final int[] sortedRanges; - @CompilationFinal(dimensions = 1) private final CompilationFinalBitSet[] bitSets; + @CompilationFinal(dimensions = 2) private final long[][] bitSets; /** * Constructs a new {@link HybridBitSetMatcher}. @@ -81,15 +84,15 @@ public abstract class HybridBitSetMatcher extends InvertibleCharMatcher { * @param bitSets the bit sets that match the low bytes if the character under inspection has * the corresponding high byte. 
*/ - HybridBitSetMatcher(boolean invert, int[] sortedRanges, CompilationFinalBitSet[] bitSets) { + HybridBitSetMatcher(boolean invert, int[] sortedRanges, long[][] bitSets) { super(invert); this.sortedRanges = sortedRanges; this.bitSets = bitSets; assert bitSets.length == sortedRanges.length / 2; } - public static HybridBitSetMatcher create(boolean invert, int[] sortedRanges, CompilationFinalBitSet[] bitSets) { - return HybridBitSetMatcherNodeGen.create(invert, sortedRanges, bitSets); + public static HybridBitSetMatcher create(boolean invert, CompressedCodePointSet ccps) { + return HybridBitSetMatcherNodeGen.create(invert, ccps.getRanges(), ccps.getBitSets()); } @Specialization @@ -112,7 +115,7 @@ private boolean matchTree(int fromIndex, int toIndex, int c) { } else if (c > sortedRanges[(mid << 1) + 1]) { return matchTree(mid + 1, toIndex, c); } else { - return result(bitSets[mid] == null || bitSets[mid].get(lowByte(c))); + return result(bitSets[mid] == null || BitSets.get(bitSets[mid], lowByte(c))); } } @@ -129,7 +132,7 @@ public String toString() { if (bitSets[i / 2] == null) { sb.append(Range.toString(sortedRanges[i], sortedRanges[i + 1])); } else { - sb.append("[range: ").append(Range.toString(sortedRanges[i], sortedRanges[i + 1])).append(", bs: ").append(bitSets[i / 2]).append("]"); + sb.append("[range: ").append(Range.toString(sortedRanges[i], sortedRanges[i + 1])).append(", bs: ").append(BitSets.toString(bitSets[i / 2])).append("]"); } } return sb.append("]").toString(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/MultiBitSetMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/MultiBitSetMatcher.java index 7bd73f074c5a..caab7ee867c7 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/MultiBitSetMatcher.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/MultiBitSetMatcher.java @@ -47,7 
+47,7 @@ import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.regex.charset.ImmutableSortedListOfIntRanges; import com.oracle.truffle.regex.charset.Range; -import com.oracle.truffle.regex.util.CompilationFinalBitSet; +import com.oracle.truffle.regex.util.BitSets; /** * Character matcher that uses an array of 256 bit sets to fully cover the 16 bit character space. @@ -60,17 +60,13 @@ public abstract class MultiBitSetMatcher extends InvertibleCharMatcher { private static final int BYTE_MAX_VALUE = 255; private static final int BYTE_MIN_VALUE = 0; - private static final CompilationFinalBitSet MATCH_NONE = new CompilationFinalBitSet(BYTE_RANGE); - private static final CompilationFinalBitSet MATCH_ALL = new CompilationFinalBitSet(BYTE_RANGE); - - static { - MATCH_ALL.invert(); - } + private static final long[] MATCH_NONE = {0L, 0L, 0L, 0L}; + private static final long[] MATCH_ALL = {~0L, ~0L, ~0L, ~0L}; public static MultiBitSetMatcher fromRanges(boolean inverse, ImmutableSortedListOfIntRanges cps) { - CompilationFinalBitSet[] bitSets = new CompilationFinalBitSet[BYTE_RANGE]; + long[][] bitSets = new long[BYTE_RANGE][]; Arrays.fill(bitSets, MATCH_NONE); - CompilationFinalBitSet cur = new CompilationFinalBitSet(BYTE_RANGE); + long[] cur = new long[4]; int curByte = -1; for (Range r : cps) { if (curByte == -1) { @@ -78,36 +74,36 @@ public static MultiBitSetMatcher fromRanges(boolean inverse, ImmutableSortedList } if (highByte(r.lo) > curByte) { bitSets[curByte] = cur; - cur = new CompilationFinalBitSet(BYTE_RANGE); + cur = new long[4]; curByte = highByte(r.lo); } if (highByte(r.lo) == highByte(r.hi)) { - cur.setRange(lowByte(r.lo), lowByte(r.hi)); + BitSets.setRange(cur, lowByte(r.lo), lowByte(r.hi)); } else { - cur.setRange(lowByte(r.lo), BYTE_MAX_VALUE); + BitSets.setRange(cur, lowByte(r.lo), BYTE_MAX_VALUE); bitSets[curByte] = cur; for (int j = highByte(r.lo) + 1; j < highByte(r.hi); j++) { bitSets[j] = MATCH_ALL; } - cur = new 
CompilationFinalBitSet(BYTE_RANGE); + cur = new long[4]; curByte = highByte(r.hi); - cur.setRange(BYTE_MIN_VALUE, lowByte(r.hi)); + BitSets.setRange(cur, BYTE_MIN_VALUE, lowByte(r.hi)); } } bitSets[curByte] = cur; return MultiBitSetMatcherNodeGen.create(inverse, bitSets); } - @CompilationFinal(dimensions = 1) private final CompilationFinalBitSet[] bitSets; + @CompilationFinal(dimensions = 2) private final long[][] bitSets; - MultiBitSetMatcher(boolean invert, CompilationFinalBitSet[] bitSets) { + MultiBitSetMatcher(boolean invert, long[][] bitSets) { super(invert); this.bitSets = bitSets; } @Specialization protected boolean match(int c, boolean compactString) { - return result(bitSets[compactString ? 0 : highByte(c)].get(lowByte(c))); + return result(BitSets.get(bitSets[compactString ? 0 : highByte(c)], lowByte(c))); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/NullHighByteBitSetMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/NullHighByteBitSetMatcher.java index 5ab8bb787d6d..81def4e7d47e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/NullHighByteBitSetMatcher.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/NullHighByteBitSetMatcher.java @@ -41,8 +41,9 @@ package com.oracle.truffle.regex.tregex.matchers; import com.oracle.truffle.api.CompilerDirectives; +import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.dsl.Specialization; -import com.oracle.truffle.regex.util.CompilationFinalBitSet; +import com.oracle.truffle.regex.util.BitSets; /** * Specialized {@link BitSetMatcher} that exists simply because ascii bit set matchers occur often @@ -50,20 +51,20 @@ */ public abstract class NullHighByteBitSetMatcher extends InvertibleCharMatcher { - private final CompilationFinalBitSet bitSet; + @CompilationFinal(dimensions = 1) private final 
long[] bitSet; - NullHighByteBitSetMatcher(boolean inverse, CompilationFinalBitSet bitSet) { + NullHighByteBitSetMatcher(boolean inverse, long[] bitSet) { super(inverse); this.bitSet = bitSet; } - public static NullHighByteBitSetMatcher create(boolean inverse, CompilationFinalBitSet bitSet) { + public static NullHighByteBitSetMatcher create(boolean inverse, long[] bitSet) { return NullHighByteBitSetMatcherNodeGen.create(inverse, bitSet); } @Specialization protected boolean match(int c, @SuppressWarnings("unused") boolean compactString) { - return result(bitSet.get(c)); + return result(BitSets.get(bitSet, c)); } @Override @@ -74,6 +75,6 @@ public int estimatedCost() { @Override @CompilerDirectives.TruffleBoundary public String toString() { - return modifiersToString() + "{ascii " + bitSet + "}"; + return modifiersToString() + "{ascii " + BitSets.toString(bitSet) + "}"; } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/ProfilingCharMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/ProfilingCharMatcher.java deleted file mode 100644 index 85955401ac09..000000000000 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/matchers/ProfilingCharMatcher.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * The Universal Permissive License (UPL), Version 1.0 - * - * Subject to the condition set forth below, permission is hereby granted to any - * person obtaining a copy of this software, associated documentation and/or - * data (collectively the "Software"), free of charge and under any and all - * copyright rights in the Software, and any and all patent rights owned or - * freely licensable by each licensor hereunder covering either (i) the - * unmodified Software as contributed to or provided by such licensor, or (ii) - * the Larger Works (as defined below), to deal in both - * - * (a) the Software, and - * - * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if - * one is included with the Software each a "Larger Work" to which the Software - * is contributed by such licensors), - * - * without restriction, including without limitation the rights to copy, create - * derivative works of, display, perform, and distribute the Software and make, - * use, sell, offer for sale, import, export, have made, and have sold the - * Software and the Larger Work(s), and to sublicense the foregoing rights on - * either these or other terms. - * - * This license is subject to the following condition: - * - * The above copyright notice and either this complete permission notice or at a - * minimum a reference to the UPL must be included in all copies or substantial - * portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.oracle.truffle.regex.tregex.matchers; - -import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.api.dsl.Specialization; - -public abstract class ProfilingCharMatcher extends CharMatcher { - - @Child private CharMatcher byteMatcher; - @Child private CharMatcher charMatcher; - - ProfilingCharMatcher(CharMatcher byteMatcher, CharMatcher charMatcher) { - this.byteMatcher = byteMatcher; - this.charMatcher = charMatcher; - } - - public static ProfilingCharMatcher create(CharMatcher byteMatcher, CharMatcher charMatcher) { - return ProfilingCharMatcherNodeGen.create(byteMatcher, charMatcher); - } - - @Specialization(guards = "compactString") - boolean matchCompactString(int c, boolean compactString) { - return byteMatcher.execute(c, compactString); - } - - @Specialization(guards = {"!compactString", "isByte(c)"}) - boolean matchByte(int c, boolean compactString) { - return byteMatcher.execute(c, compactString); - } - - @Specialization(guards = "!compactString", replaces = "matchByte") - boolean matchChar(int c, boolean compactString) { - return charMatcher.execute(c, compactString); - } - - static boolean isByte(int c) { - return c < 256; - } - - @Override - public int estimatedCost() { - return charMatcher.estimatedCost(); - } - - @TruffleBoundary - @Override - public String toString() { - return charMatcher.toString(); - } -} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/ASTSuccessor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/ASTSuccessor.java index 8c51a3f5fb4b..8a454b411e26 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/ASTSuccessor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/ASTSuccessor.java @@ -80,8 +80,8 @@ public ASTTransition getInitialTransition() { return initialTransition; } - public CodePointSet getInitialTransitionCharSet() { - return 
initialTransition.getTarget() instanceof CharacterClass ? ((CharacterClass) initialTransition.getTarget()).getCharSet() : CodePointSet.getFull(); + public CodePointSet getInitialTransitionCharSet(CompilationBuffer compilationBuffer) { + return initialTransition.getTarget() instanceof CharacterClass ? ((CharacterClass) initialTransition.getTarget()).getCharSet() : compilationBuffer.getEncoding().getFullSet(); } public void setInitialTransition(ASTTransition initialTransition) { @@ -117,13 +117,13 @@ public ArrayList> getMergedStat private void mergeLookArounds(ASTTransitionCanonicalizer canonicalizer, CompilationBuffer compilationBuffer) { assert mergedStates.isEmpty(); - canonicalizer.addArgument(initialTransition, getInitialTransitionCharSet()); + canonicalizer.addArgument(initialTransition, getInitialTransitionCharSet(compilationBuffer)); for (ASTStep lookBehind : lookBehinds) { ASTSuccessor lb = lookBehind.getSuccessors().get(0); if (lookBehind.getSuccessors().size() > 1 || lb.hasLookArounds()) { throw new UnsupportedRegexException("nested look-behind assertions"); } - CodePointSet intersection = getInitialTransitionCharSet().createIntersection(lb.getInitialTransitionCharSet(), compilationBuffer); + CodePointSet intersection = getInitialTransitionCharSet(compilationBuffer).createIntersection(lb.getInitialTransitionCharSet(compilationBuffer), compilationBuffer); if (intersection.matchesSomething()) { canonicalizer.addArgument(lb.getInitialTransition(), intersection); } @@ -146,7 +146,7 @@ private void addAllIntersecting(ASTTransitionCanonicalizer canonicalizer, Transi ArrayList> result, CompilationBuffer compilationBuffer) { for (ASTSuccessor successor : lookAround.getSuccessors()) { for (TransitionBuilder lookAroundState : successor.getMergedStates(canonicalizer, compilationBuffer)) { - CodePointSet intersection = state.getMatcherBuilder().createIntersection(lookAroundState.getMatcherBuilder(), compilationBuffer); + CodePointSet intersection = 
state.getCodePointSet().createIntersection(lookAroundState.getCodePointSet(), compilationBuffer); if (intersection.matchesSomething()) { if (mergedTransitions == null) { mergedTransitions = new ObjectArrayBuffer<>(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java index 58ba77977b77..ee9d884358c0 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java @@ -234,9 +234,9 @@ private NFAStateTransition[] createNFATransitions(NFAState sourceState, ASTStep transitionsBuffer.add(createTransition(sourceState, finalState)); } } else if (!containsPositionAssertion) { - assert mergeBuilder.getMatcherBuilder().matchesSomething(); + assert mergeBuilder.getCodePointSet().matchesSomething(); transitionsBuffer.add(createTransition(sourceState, - registerMatcherState(stateSetCC, mergeBuilder.getMatcherBuilder(), finishedLookBehinds, containsPrefixStates))); + registerMatcherState(stateSetCC, mergeBuilder.getCodePointSet(), finishedLookBehinds, containsPrefixStates))); } transitionGBUpdateIndices.clear(); transitionGBClearIndices.clear(); @@ -246,7 +246,7 @@ private NFAStateTransition[] createNFATransitions(NFAState sourceState, ASTStep } private NFAState createFinalState(StateSet stateSet) { - NFAState state = new NFAState((short) stateID.inc(), stateSet, CodePointSet.getFull(), Collections.emptySet(), false); + NFAState state = new NFAState((short) stateID.inc(), stateSet, ast.getEncoding().getFullSet(), Collections.emptySet(), false); assert !nfaStates.containsKey(state.getStateSet()); nfaStates.put(state.getStateSet(), state); return state; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecRootNode.java 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecRootNode.java index f514c3c79f75..4622fbe6588c 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecRootNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecRootNode.java @@ -40,15 +40,12 @@ */ package com.oracle.truffle.regex.tregex.nodes; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_BAILOUT_MESSAGES; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_INTERNAL_ERRORS; -import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_SWITCH_TO_EAGER; - import com.oracle.truffle.api.CallTarget; import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.Truffle; import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.api.profiles.ConditionProfile; import com.oracle.truffle.regex.RegexExecRootNode; import com.oracle.truffle.regex.RegexFlags; import com.oracle.truffle.regex.RegexLanguage; @@ -73,6 +70,7 @@ import com.oracle.truffle.regex.tregex.nodes.nfa.TRegexNFAExecutorNode; import com.oracle.truffle.regex.tregex.parser.ast.RegexAST; import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; +import com.oracle.truffle.regex.tregex.util.Loggers; public class TRegexExecRootNode extends RegexExecRootNode implements RegexProfile.TracksRegexProfile { @@ -90,6 +88,7 @@ public class TRegexExecRootNode extends RegexExecRootNode implements RegexProfil private final int numberOfCaptureGroups; private final boolean regressionTestMode; private final boolean backtrackingMode; + private final ConditionProfile inputProfile = ConditionProfile.createBinaryProfile(); @Child private RunRegexSearchNode runnerNode; @@ -148,6 +147,10 @@ public int getNumberOfCaptureGroups() { return numberOfCaptureGroups; } + public ConditionProfile getInputProfile() { + return 
inputProfile; + } + private boolean validResult(Object input, int fromIndex, RegexResult result) { if (result == NoMatchResult.getInstance()) { return true; @@ -159,7 +162,7 @@ private boolean validResult(Object input, int fromIndex, RegexResult result) { int start = result.getStart(i); int end = result.getEnd(i); if (start > end || (Math.min(start, end) < 0 && Math.max(start, end) >= 0)) { - LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nINVALID Result: %s", getSource(), input, fromIndex, result)); + Loggers.LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nINVALID Result: %s", getSource(), input, fromIndex, result)); return false; } } @@ -171,7 +174,7 @@ private boolean backtrackerProducesSameResult(Object input, int fromIndex, Regex if (resultsEqual(result, btResult, getNumberOfCaptureGroups())) { return true; } - LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nBacktracker Result: %s\nDFA Result: %s", getSource(), input, fromIndex, btResult, result)); + Loggers.LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nBacktracker Result: %s\nDFA Result: %s", getSource(), input, fromIndex, btResult, result)); return false; } @@ -184,7 +187,8 @@ private boolean nfaProducesSameResult(Object input, int fromIndex, RegexResult r if (resultsEqual(result, btResult, getNumberOfCaptureGroups())) { return true; } - LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nNFA executor Result: %s\nDFA Result: %s", getSource(), input, fromIndex, btResult, result)); + Loggers.LOG_INTERNAL_ERRORS.severe( + () -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nNFA executor Result: %s\nDFA Result: %s", getSource(), input, fromIndex, btResult, result)); return false; } @@ -197,7 +201,8 @@ private boolean noSimpleCGLazyDFAProducesSameResult(Object input, int fromIndex, if (resultsEqual(result, noSimpleCGResult, 
getNumberOfCaptureGroups())) { return true; } - LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nLazyDFA Result: %s\nSimplCGDFA Result: %s", getSource(), input, fromIndex, noSimpleCGResult, result)); + Loggers.LOG_INTERNAL_ERRORS.severe( + () -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nLazyDFA Result: %s\nSimplCGDFA Result: %s", getSource(), input, fromIndex, noSimpleCGResult, result)); return false; } @@ -216,7 +221,7 @@ private boolean eagerAndLazyDFAProduceSameResult(Object input, int fromIndex, Re } boolean equal = resultsEqual(lazyResult, eagerResult, getNumberOfCaptureGroups()); if (!equal) { - LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nLazy Result: %s\nEager Result: %s", getSource(), input, fromIndex, lazyResult, eagerResult)); + Loggers.LOG_INTERNAL_ERRORS.severe(() -> String.format("Regex: %s\nInput: %s\nfromIndex: %d\nLazy Result: %s\nEager Result: %s", getSource(), input, fromIndex, lazyResult, eagerResult)); } return equal; } @@ -275,7 +280,7 @@ private LazyCaptureGroupRegexSearchNode compileLazyDFA(boolean allowSimpleCG) { try { return tRegexCompiler.compileLazyDFAExecutor(((TRegexNFAExecutorNode) nfaNode.getExecutor()).getNFA(), this, allowSimpleCG); } catch (UnsupportedRegexException e) { - LOG_BAILOUT_MESSAGES.fine(() -> e.getReason() + ": " + source); + Loggers.LOG_BAILOUT_MESSAGES.fine(() -> e.getReason() + ": " + source); return LAZY_DFA_BAILED_OUT; } } @@ -287,7 +292,7 @@ private boolean canSwitchToEagerDFA() { private void switchToEagerDFA(RegexProfile profile) { compileEagerDFA(); if (eagerDFANode != EAGER_DFA_BAILED_OUT) { - LOG_SWITCH_TO_EAGER.fine(() -> "regex " + getSource() + ": switching to eager matching." + (profile == null ? "" : " profile: " + profile)); + Loggers.LOG_SWITCH_TO_EAGER.fine(() -> "regex " + getSource() + ": switching to eager matching." + (profile == null ? 
"" : " profile: " + profile)); runnerNode = insert(eagerDFANode); } } @@ -298,7 +303,7 @@ private void compileEagerDFA() { TRegexDFAExecutorNode executorNode = tRegexCompiler.compileEagerDFAExecutor(getSource()); eagerDFANode = new EagerCaptureGroupRegexSearchNode(createEntryNode(executorNode)); } catch (UnsupportedRegexException e) { - LOG_BAILOUT_MESSAGES.fine(() -> e.getReason() + ": " + source); + Loggers.LOG_BAILOUT_MESSAGES.fine(() -> e.getReason() + ": " + source); eagerDFANode = EAGER_DFA_BAILED_OUT; } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecutorNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecutorNode.java index d742733717f9..220703bf65bb 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecutorNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecutorNode.java @@ -44,6 +44,7 @@ import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.api.profiles.ConditionProfile; import com.oracle.truffle.regex.tregex.string.Encodings; import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.tregex.string.Encodings.Encoding.UTF16; @@ -61,6 +62,10 @@ public Encoding getEncoding() { return root.getEncoding(); } + public ConditionProfile getInputProfile() { + return root.getInputProfile(); + } + /** * The length of the {@code input} argument given to * {@link TRegexExecRootNode#execute(Object, int)}. @@ -147,7 +152,7 @@ public boolean inputUTF16IsLowSurrogate(int c) { return UTF16.isLowSurrogate(c, isForward()); } - protected int inputUTF16ToCodePoint(int highSurrogate, int lowSurrogate) { + public int inputUTF16ToCodePoint(int highSurrogate, int lowSurrogate) { return isForward() ? 
Character.toCodePoint((char) highSurrogate, (char) lowSurrogate) : Character.toCodePoint((char) lowSurrogate, (char) highSurrogate); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/AllTransitionsInOneTreeMatcher.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/AllTransitionsInOneTreeMatcher.java index a5a67af588f8..678a6e49a585 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/AllTransitionsInOneTreeMatcher.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/AllTransitionsInOneTreeMatcher.java @@ -40,75 +40,181 @@ */ package com.oracle.truffle.regex.tregex.nodes.dfa; -import static com.oracle.truffle.api.CompilerDirectives.CompilationFinal; -import static com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; - import com.oracle.truffle.api.CompilerAsserts; +import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; +import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.api.nodes.ExplodeLoop; +import com.oracle.truffle.regex.charset.CompressedCodePointSet; import com.oracle.truffle.regex.charset.Constants; +import com.oracle.truffle.regex.util.BitSets; /** * This class provides an alternative way of calculating the next transition - instead of checking * all transitions in sequential manner, all ranges of all transitions are merged into one sorted * array, which is then searched in tree-recursive fashion. + * + * @see CompressedCodePointSet */ public final class AllTransitionsInOneTreeMatcher { - @CompilationFinal(dimensions = 1) private final int[] sortedRanges; - @CompilationFinal(dimensions = 1) private final short[] rangeTreeSuccessors; + /** + * Data structure for optimized matching of multiple ranges in one lower byte range. 
+ * + * @see CompressedCodePointSet + */ + public static final class AllTransitionsInOneTreeLeafMatcher { + + @CompilationFinal(dimensions = 2) private final long[][] bitSets; + @CompilationFinal(dimensions = 1) private final short[] successors; + @CompilationFinal(dimensions = 1) private final int[] ranges; + + public AllTransitionsInOneTreeLeafMatcher(long[][] bitSets, short[] successors, int[] ranges) { + assert successors.length == bitSets.length + ranges.length + 1; + this.bitSets = bitSets; + this.successors = successors; + this.ranges = ranges; + } + + @Override + @TruffleBoundary + public String toString() { + StringBuilder sb = new StringBuilder("ranges: ").append(rangesToString(ranges)).append("\nbitsets:\n"); + for (int i = 0; i < bitSets.length; i++) { + sb.append(BitSets.toString(bitSets[i])).append("\n"); + } + return sb.toString(); + } + } + + @CompilationFinal(dimensions = 1) private final int[] ranges; + @CompilationFinal(dimensions = 1) private final short[] successors; + @CompilationFinal(dimensions = 1) private final AllTransitionsInOneTreeLeafMatcher[] leafMatchers; /** * Constructs a new {@link AllTransitionsInOneTreeMatcher}. * - * @param sortedRanges a sorted list of adjacent character ranges, in the following format: - * Every character in the array simultaneously represents the inclusive lower bound - * of a range and the exclusive upper bound of a range. The algorithm adds an - * implicit zero at the begin and an implicit {@link Constants#MAX_CODE_POINT} + 1 at - * the end of the array. An array representing the ranges - * {@code [0x00-0x10][0x10-0xff][0xff-0x2000][0x2000-0x10000]} (represented with + * @param ranges a sorted list of adjacent character ranges, in the following format: Every + * character in the array simultaneously represents the inclusive lower bound of a + * range and the exclusive upper bound of a range. 
The algorithm adds an implicit + zero at the beginning and an implicit {@link Constants#MAX_CODE_POINT} + 1 at the end + of the array. An array representing the ranges + {@code [0x00-0x10][0x10-0xff][0xff-0x2000][0x2000-0x10ffff]} (represented with * exclusive upper bound) would be: {@code [0x10, 0xff, 0x2000]}. - * @param rangeTreeSuccessors the list of successors corresponding to every range in the sorted - * list of ranges. every entry in this array is an index of - * {@link DFAStateNode#getSuccessors()}. + * @param successors the list of successors corresponding to every range in the sorted list of + * ranges. Every entry in this array is an index of + * {@link DFAStateNode#getSuccessors()}, or a negative index. A negative index can + * mean one of two things: {@code -1} denotes "no successor", indices below + * {@code -1} denote {@link AllTransitionsInOneTreeLeafMatcher leaf matchers}. These + * specialized matchers are used when many ranges lie in the same lower byte range, + * i.e. all bytes of their numerical values except the lowest one are equal (e.g. + * {@code [0x2020-0x2021][0x2030-0x2031]...}). 
*/ - public AllTransitionsInOneTreeMatcher(int[] sortedRanges, short[] rangeTreeSuccessors) { - assert sortedRanges.length > 0 : "This class should never be used for trivial transitions, use a list of CharMatchers instead!"; - assert rangeTreeSuccessors.length == sortedRanges.length + 1; - this.sortedRanges = sortedRanges; - this.rangeTreeSuccessors = rangeTreeSuccessors; + public AllTransitionsInOneTreeMatcher(int[] ranges, short[] successors, AllTransitionsInOneTreeLeafMatcher[] leafMatchers) { + assert successors.length == ranges.length + 1; + this.ranges = ranges; + this.successors = successors; + this.leafMatchers = leafMatchers; } public int checkMatchTree(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, DFAStateNode stateNode, int c) { CompilerAsserts.partialEvaluationConstant(this); CompilerAsserts.partialEvaluationConstant(stateNode); - return checkMatchTree(locals, executor, stateNode, 0, sortedRanges.length - 1, c); + return checkMatchTree(locals, executor, stateNode, 0, ranges.length - 1, c); } + /** + * Recursive binary-search through {@code ranges}. 
+ */ private int checkMatchTree(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, DFAStateNode stateNode, int fromIndex, int toIndex, int c) { CompilerAsserts.partialEvaluationConstant(stateNode); CompilerAsserts.partialEvaluationConstant(fromIndex); CompilerAsserts.partialEvaluationConstant(toIndex); if (fromIndex > toIndex) { - final short successor = rangeTreeSuccessors[fromIndex]; - if (successor != DFAStateNode.FS_RESULT_NO_SUCCESSOR) { + final short successor = successors[fromIndex]; + if (successor == -1) { + return successor; + } else if (successor < -1) { + return checkMatchLeaf((successor * -1) - 2, locals, executor, stateNode, c); + } else { stateNode.successorFound(locals, executor, successor); } return successor; } final int mid = (fromIndex + toIndex) >>> 1; CompilerAsserts.partialEvaluationConstant(mid); - if (c < sortedRanges[mid]) { + if (c < ranges[mid]) { return checkMatchTree(locals, executor, stateNode, fromIndex, mid - 1, c); } else { return checkMatchTree(locals, executor, stateNode, mid + 1, toIndex, c); } } + /** + * The search has been narrowed down to a byte range, continue in a leaf matcher. Here, we first + * check all bit sets, and if none match, we check the remaining ranges that did not get + * converted to bit sets. 
+ */ + @ExplodeLoop + private int checkMatchLeaf(int iLeaf, TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, DFAStateNode stateNode, int c) { + CompilerAsserts.partialEvaluationConstant(iLeaf); + AllTransitionsInOneTreeLeafMatcher leafMatcher = leafMatchers[iLeaf]; + int lowByte = BitSets.lowByte(c); + for (int i = 0; i < leafMatcher.bitSets.length; i++) { + CompilerAsserts.partialEvaluationConstant(i); + if (BitSets.get(leafMatcher.bitSets[i], lowByte)) { + final short successor = leafMatcher.successors[i]; + CompilerAsserts.partialEvaluationConstant(successor); + stateNode.successorFound(locals, executor, successor); + return successor; + } + } + return checkMatchLeafSubTree(locals, executor, stateNode, leafMatcher, 0, leafMatcher.ranges.length - 1, c); + } + + /** + * Recursive binary-search through {@code ranges} of a {@link AllTransitionsInOneTreeLeafMatcher + * leaf matcher}. + */ + private static int checkMatchLeafSubTree(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, DFAStateNode stateNode, + AllTransitionsInOneTreeLeafMatcher leafMatcher, int fromIndex, int toIndex, int c) { + CompilerAsserts.partialEvaluationConstant(leafMatcher); + CompilerAsserts.partialEvaluationConstant(fromIndex); + CompilerAsserts.partialEvaluationConstant(toIndex); + if (fromIndex > toIndex) { + final short successor = leafMatcher.successors[leafMatcher.bitSets.length + fromIndex]; + CompilerAsserts.partialEvaluationConstant(successor); + if (successor == -1) { + int lo = fromIndex == 0 ? 0 : leafMatcher.ranges[fromIndex - 1]; + int hi = fromIndex == leafMatcher.ranges.length ? Character.MAX_CODE_POINT + 1 : leafMatcher.ranges[fromIndex]; + CompilerAsserts.partialEvaluationConstant(lo); + CompilerAsserts.partialEvaluationConstant(hi); + // TODO: move bitset matches here. requires PE intrinsic. 
+ return successor; + } else { + stateNode.successorFound(locals, executor, successor); + return successor; + } + } + final int mid = (fromIndex + toIndex) >>> 1; + CompilerAsserts.partialEvaluationConstant(mid); + if (c < leafMatcher.ranges[mid]) { + return checkMatchLeafSubTree(locals, executor, stateNode, leafMatcher, fromIndex, mid - 1, c); + } else { + return checkMatchLeafSubTree(locals, executor, stateNode, leafMatcher, mid + 1, toIndex, c); + } + } + @TruffleBoundary @Override public String toString() { - StringBuilder sb = new StringBuilder("AllTransitionsInOneTreeMatcher: ["); + return "AllTransitionsInOneTreeMatcher: " + rangesToString(ranges); + } + + @TruffleBoundary + private static String rangesToString(int[] ranges) { + StringBuilder sb = new StringBuilder("["); boolean first = true; - for (int c : sortedRanges) { + for (int c : ranges) { if (first) { first = false; } else { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/BackwardDFAStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/BackwardDFAStateNode.java index 19d9ed4e5e68..53d422003560 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/BackwardDFAStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/BackwardDFAStateNode.java @@ -40,13 +40,11 @@ */ package com.oracle.truffle.regex.tregex.nodes.dfa; -import com.oracle.truffle.regex.tregex.matchers.CharMatcher; - public class BackwardDFAStateNode extends DFAStateNode { - public BackwardDFAStateNode(short id, byte flags, LoopOptimizationNode loopOptimizationNode, short[] successors, CharMatcher[] matchers, DFASimpleCG simpleCG, + public BackwardDFAStateNode(short id, byte flags, short loopTransitionIndex, LoopOptimizationNode loopOptimizationNode, short[] successors, Matchers matchers, DFASimpleCG simpleCG, AllTransitionsInOneTreeMatcher 
allTransitionsInOneTreeMatcher) { - super(id, flags, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); + super(id, flags, loopTransitionIndex, loopOptimizationNode, successors, matchers, simpleCG, allTransitionsInOneTreeMatcher); } protected BackwardDFAStateNode(BackwardDFAStateNode copy, short copyID) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/CGTrackingDFAStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/CGTrackingDFAStateNode.java index 9085af9c4bcf..ff4af79f1ea4 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/CGTrackingDFAStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/CGTrackingDFAStateNode.java @@ -42,10 +42,6 @@ import com.oracle.truffle.api.CompilerAsserts; import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; -import com.oracle.truffle.api.nodes.ExplodeLoop; -import com.oracle.truffle.regex.tregex.matchers.CharMatcher; -import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorLocals; -import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorNode; public class CGTrackingDFAStateNode extends DFAStateNode { @@ -58,12 +54,12 @@ public class CGTrackingDFAStateNode extends DFAStateNode { @Child private DFACaptureGroupPartialTransitionDispatchNode transitionDispatchNode; - public CGTrackingDFAStateNode(short id, byte flags, LoopOptimizationNode loopOptimizationNode, short[] successors, CharMatcher[] matchers, + public CGTrackingDFAStateNode(short id, byte flags, short loopTransitionIndex, LoopOptimizationNode loopOptimizationNode, short[] successors, Matchers matchers, AllTransitionsInOneTreeMatcher allTransitionsInOneTreeMatcher, short[] captureGroupTransitions, short[] precedingCaptureGroupTransitions, DFACaptureGroupPartialTransition anchoredFinalStateTransition, DFACaptureGroupPartialTransition 
unAnchoredFinalStateTransition) { - super(id, flags, loopOptimizationNode, successors, matchers, null, allTransitionsInOneTreeMatcher); + super(id, flags, loopTransitionIndex, loopOptimizationNode, successors, matchers, null, allTransitionsInOneTreeMatcher); this.captureGroupTransitions = captureGroupTransitions; this.precedingCaptureGroupTransitions = precedingCaptureGroupTransitions; transitionDispatchNode = precedingCaptureGroupTransitions.length > 1 ? DFACaptureGroupPartialTransitionDispatchNode.create(precedingCaptureGroupTransitions) : null; @@ -100,15 +96,13 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut } if (treeTransitionMatching()) { doTreeMatch(locals, executor, compactString); + executor.inputAdvance(locals); return; } - if (checkMatch(locals, executor, compactString)) { + if (checkMatchAndAdvance(locals, executor, compactString)) { final int preLoopIndex = locals.getIndex(); if (doIndexof(executor)) { - int indexOfResult = loopOptimizationNode.getIndexOfNode().execute(locals.getInput(), - locals.getIndex(), - executor.getMaxIndex(locals), - loopOptimizationNode.getIndexOfChars()); + int indexOfResult = loopOptimizationNode.execute(locals.getInput(), locals.getIndex(), executor.getMaxIndex(locals)); indexofApplyLoopReorders(locals, executor, preLoopIndex, indexOfResult < 0 ? 
executor.getMaxIndex(locals) : indexOfResult); if (indexOfResult < 0) { locals.setSuccessorIndex(atEndLoop(locals, executor, preLoopIndex)); @@ -123,7 +117,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut } } while (executor.inputHasNext(locals)) { - if (!checkMatchLoop(locals, executor, compactString, preLoopIndex)) { + if (!checkMatchLoopAndAdvance(locals, executor, compactString, preLoopIndex)) { return; } } @@ -131,8 +125,20 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut } } + private boolean checkMatchAndAdvance(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { + boolean match = checkMatch(locals, executor, compactString, false, 0); + executor.inputAdvance(locals); + return match; + } + + private boolean checkMatchLoopAndAdvance(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString, int preLoopIndex) { + boolean match = checkMatch(locals, executor, compactString, true, preLoopIndex); + executor.inputAdvance(locals); + return match; + } + private boolean doIndexof(TRegexDFAExecutorNode executor) { - return executor.isForward() && hasLoopToSelf() && loopOptimizationNode.getIndexOfChars() != null; + return executor.isForward() && hasLoopToSelf() && loopOptimizationNode != null; } private void indexofApplyLoopReorders(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, final int preLoopIndex, int postLoopIndex) { @@ -153,86 +159,6 @@ private void indexofApplyLoopReorders(TRegexDFAExecutorLocals locals, TRegexDFAE assert locals.getIndex() == postLoopIndex; } - private void doTreeMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { - final int c = executor.inputRead(locals); - int successor = getTreeMatcher().checkMatchTree(locals, executor, this, c); - assert sameResultAsRegularMatchers(executor, c, compactString, successor) : this.toString(); - 
locals.setSuccessorIndex(successor); - executor.inputAdvance(locals); - } - - /** - * Finds the first matching transition. If a transition matches, - * {@link #successorFound(TRegexDFAExecutorLocals, TRegexDFAExecutorNode, int)} is called. The - * index of the element of {@link #getMatchers()} that matched the current input character ( - * {@link TRegexExecutorNode#inputRead(TRegexExecutorLocals)}) or - * {@link #FS_RESULT_NO_SUCCESSOR} is stored via - * {@link TRegexDFAExecutorLocals#setSuccessorIndex(int)}. - * - * @param locals a virtual frame as described by {@link TRegexDFAExecutorProperties}. - * @param executor this node's parent {@link TRegexDFAExecutorNode}. - * @param compactString {@code true} if the input string is a compact string, must be partial - * evaluation constant. - * @return {@code true} if the matching transition loops back to this state, {@code false} - * otherwise. - */ - @ExplodeLoop(kind = ExplodeLoop.LoopExplosionKind.FULL_EXPLODE_UNTIL_RETURN) - private boolean checkMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { - final int c = executor.inputRead(locals); - for (int i = 0; i < matchers.length; i++) { - if (matchers[i].execute(c, compactString)) { - CompilerAsserts.partialEvaluationConstant(i); - successorFound(locals, executor, i); - locals.setSuccessorIndex(i); - executor.inputAdvance(locals); - return isLoopToSelf(i); - } - } - locals.setSuccessorIndex(FS_RESULT_NO_SUCCESSOR); - executor.inputAdvance(locals); - return false; - } - - /** - * Finds the first matching transition. This method is called only if the transitions found by - * {@link #checkMatch(TRegexDFAExecutorLocals, TRegexDFAExecutorNode, boolean)} was a loop back - * to this state (indicated by {@link #isLoopToSelf(int)}), and will be called in a loop until a - * transition other than the loop back transition matches. 
If a transition other than the - * looping transition matches, - * {@link #successorFoundLoop(TRegexDFAExecutorLocals, TRegexDFAExecutorNode, int, int)} is - * called. The index of the element of {@link #getMatchers()} that matched the current input - * character ( {@link TRegexExecutorNode#inputRead(TRegexExecutorLocals)}) or - * {@link #FS_RESULT_NO_SUCCESSOR} is stored via - * {@link TRegexDFAExecutorLocals#setSuccessorIndex(int)}. If no transition matches, - * {@link #noSuccessorLoop(TRegexDFAExecutorLocals, TRegexDFAExecutorNode, int)} is called. - * - * @param locals a virtual frame as described by {@link TRegexDFAExecutorProperties}. - * @param executor this node's parent {@link TRegexDFAExecutorNode}. - * @param compactString {@code true} if the input string is a compact string, must be partial - * evaluation constant. - * @param preLoopIndex the index pointed to by {@link TRegexDFAExecutorLocals#getIndex()} - * before this method is called for the first time. - * @return {@code true} if the matching transition loops back to this state, {@code false} - * otherwise. 
- */ - @ExplodeLoop(kind = ExplodeLoop.LoopExplosionKind.FULL_EXPLODE_UNTIL_RETURN) - private boolean checkMatchLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString, int preLoopIndex) { - final int c = executor.inputRead(locals); - for (int i = 0; i < matchers.length; i++) { - if (matchers[i].execute(c, compactString)) { - CompilerAsserts.partialEvaluationConstant(i); - successorFoundLoop(locals, executor, i, preLoopIndex); - locals.setSuccessorIndex(i); - executor.inputAdvance(locals); - return isLoopToSelf(i); - } - } - locals.setSuccessorIndex(FS_RESULT_NO_SUCCESSOR); - noSuccessorLoop(locals, executor, preLoopIndex); - executor.inputAdvance(locals); - return false; - } - private void beforeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor) { CompilerAsserts.partialEvaluationConstant(this); if (executor.isSearching()) { @@ -275,7 +201,8 @@ private boolean canSkipPartialTransitionsOfLoop(TRegexDFAExecutorNode executor) return ret; } - private void successorFoundLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int i, int preLoopIndex) { + @Override + void successorFoundLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int i, int preLoopIndex) { CompilerAsserts.partialEvaluationConstant(this); CompilerAsserts.partialEvaluationConstant(i); if (!isLoopToSelf(i)) { @@ -290,7 +217,8 @@ private void successorFoundLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecuto } } - private void noSuccessorLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int preLoopIndex) { + @Override + void noSuccessorLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int preLoopIndex) { CompilerAsserts.partialEvaluationConstant(this); assert executor.isSearching(); applyLoopTransitions(locals, executor, preLoopIndex); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAFindInnerLiteralStateNode.java 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAFindInnerLiteralStateNode.java index 00b97926c177..51778acc373d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAFindInnerLiteralStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAFindInnerLiteralStateNode.java @@ -74,7 +74,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut locals.setSuccessorIndex(FS_RESULT_NO_SUCCESSOR); return; } - locals.setIndex(indexOfNode.execute(locals.getInput(), locals.getIndex(), executor.getMaxIndex(locals), innerLiteral.getLiteral(), innerLiteral.getMask())); + locals.setIndex(indexOfNode.execute(locals.getInput(), locals.getIndex(), executor.getMaxIndex(locals), innerLiteral.getLiteral().content(), innerLiteral.getMaskContent())); if (locals.getIndex() < 0) { locals.setSuccessorIndex(FS_RESULT_NO_SUCCESSOR); return; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java index 991ca258e542..a074108bbdd9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java @@ -52,7 +52,14 @@ import com.oracle.truffle.regex.tregex.matchers.CharMatcher; import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorLocals; import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.SimpleMatchers; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.UTF16Matchers; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.UTF16RawMatchers; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.UTF8Matchers; import 
com.oracle.truffle.regex.tregex.nodes.input.InputIndexOfNode; +import com.oracle.truffle.regex.tregex.nodes.input.InputIndexOfStringNode; +import com.oracle.truffle.regex.tregex.string.AbstractString; +import com.oracle.truffle.regex.tregex.string.Encodings; import com.oracle.truffle.regex.tregex.util.DebugUtil; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonArray; @@ -60,33 +67,114 @@ public class DFAStateNode extends DFAAbstractStateNode { - public static class LoopOptimizationNode extends Node { + public abstract static class LoopOptimizationNode extends Node { + + public abstract int execute(Object input, int preLoopIndex, int maxIndex); + + public abstract int encodedLength(); + + abstract LoopOptimizationNode nodeSplitCopy(); + } + + public abstract static class LoopOptIndexOfAnyNode extends LoopOptimizationNode { - private final short loopTransitionIndex; - @CompilationFinal(dimensions = 1) private final char[] indexOfChars; @Child private InputIndexOfNode indexOfNode; - public LoopOptimizationNode(short loopTransitionIndex, char[] indexOfChars) { - this.loopTransitionIndex = loopTransitionIndex; - this.indexOfChars = indexOfChars; + @Override + public int encodedLength() { + return 1; + } + + InputIndexOfNode getIndexOfNode() { + if (indexOfNode == null) { + CompilerDirectives.transferToInterpreterAndInvalidate(); + indexOfNode = insert(InputIndexOfNode.create()); + } + return indexOfNode; + } + } + + public static final class LoopOptIndexOfAnyCharNode extends LoopOptIndexOfAnyNode { + + @CompilationFinal(dimensions = 1) private final char[] chars; + + public LoopOptIndexOfAnyCharNode(char[] chars) { + this.chars = chars; + } + + private LoopOptIndexOfAnyCharNode(LoopOptIndexOfAnyCharNode copy) { + this.chars = copy.chars; + } + + @Override + public int execute(Object input, int fromIndex, int maxIndex) { + return getIndexOfNode().execute(input, fromIndex, maxIndex, chars); + } + + @Override + 
LoopOptimizationNode nodeSplitCopy() { + return new LoopOptIndexOfAnyCharNode(this); + } + } + + public static final class LoopOptIndexOfAnyByteNode extends LoopOptIndexOfAnyNode { + + @CompilationFinal(dimensions = 1) private final byte[] bytes; + + public LoopOptIndexOfAnyByteNode(byte[] bytes) { + this.bytes = bytes; + } + + private LoopOptIndexOfAnyByteNode(LoopOptIndexOfAnyByteNode copy) { + this.bytes = copy.bytes; + } + + @Override + public int execute(Object input, int fromIndex, int maxIndex) { + return getIndexOfNode().execute(input, fromIndex, maxIndex, bytes); } - private LoopOptimizationNode nodeSplitCopy() { - return new LoopOptimizationNode(loopTransitionIndex, indexOfChars); + @Override + LoopOptimizationNode nodeSplitCopy() { + return new LoopOptIndexOfAnyByteNode(this); } + } + + public static final class LoopOptIndexOfStringNode extends LoopOptimizationNode { + + private final AbstractString str; + private final AbstractString mask; + @Child private InputIndexOfStringNode indexOfNode; - public char[] getIndexOfChars() { - return indexOfChars; + public LoopOptIndexOfStringNode(AbstractString str, AbstractString mask) { + this.str = str; + this.mask = mask; } + private LoopOptIndexOfStringNode(LoopOptIndexOfStringNode copy) { + this.str = copy.str; + this.mask = copy.mask; + } + + @Override + public int execute(Object input, int fromIndex, int maxIndex) { + return getIndexOfNode().execute(input, fromIndex, maxIndex, str.content(), mask == null ? 
null : mask.content()); + } + + @Override public int encodedLength() { - return 1; + return str.encodedLength(); + } + + @Override + LoopOptimizationNode nodeSplitCopy() { + return new LoopOptIndexOfStringNode(this); } - public InputIndexOfNode getIndexOfNode() { + private InputIndexOfStringNode getIndexOfNode() { if (indexOfNode == null) { CompilerDirectives.transferToInterpreterAndInvalidate(); - indexOfNode = insert(InputIndexOfNode.create()); + indexOfNode = insert(InputIndexOfStringNode.create()); } return indexOfNode; } @@ -95,32 +183,35 @@ public InputIndexOfNode getIndexOfNode() { private static final byte FLAG_FINAL_STATE = 1; private static final byte FLAG_ANCHORED_FINAL_STATE = 1 << 1; private static final byte FLAG_HAS_BACKWARD_PREFIX_STATE = 1 << 2; + private static final byte FLAG_UTF_16_MUST_DECODE = 1 << 3; private final byte flags; + private final short loopTransitionIndex; @Child LoopOptimizationNode loopOptimizationNode; - @Children protected final CharMatcher[] matchers; + @Child Matchers matchers; private final DFASimpleCG simpleCG; private final AllTransitionsInOneTreeMatcher allTransitionsInOneTreeMatcher; private final BranchProfile stateReachedProfile = BranchProfile.create(); DFAStateNode(DFAStateNode nodeSplitCopy, short copyID) { - this(copyID, nodeSplitCopy.flags, nodeSplitCopy.loopOptimizationNode.nodeSplitCopy(), + this(copyID, nodeSplitCopy.flags, nodeSplitCopy.loopTransitionIndex, nodeSplitCopy.loopOptimizationNode.nodeSplitCopy(), Arrays.copyOf(nodeSplitCopy.getSuccessors(), nodeSplitCopy.getSuccessors().length), nodeSplitCopy.getMatchers(), nodeSplitCopy.simpleCG, nodeSplitCopy.allTransitionsInOneTreeMatcher); } - public DFAStateNode(short id, byte flags, LoopOptimizationNode loopOptimizationNode, short[] successors, CharMatcher[] matchers, DFASimpleCG simpleCG, + public DFAStateNode(short id, byte flags, short loopTransitionIndex, LoopOptimizationNode loopOptimizationNode, short[] successors, Matchers matchers, DFASimpleCG 
simpleCG, AllTransitionsInOneTreeMatcher allTransitionsInOneTreeMatcher) { super(id, successors); assert id > 0; this.flags = flags; + this.loopTransitionIndex = loopTransitionIndex; this.loopOptimizationNode = loopOptimizationNode; this.matchers = matchers; this.simpleCG = simpleCG; this.allTransitionsInOneTreeMatcher = allTransitionsInOneTreeMatcher; } - public static byte buildFlags(boolean finalState, boolean anchoredFinalState, boolean hasBackwardPrefixState) { + public static byte buildFlags(boolean finalState, boolean anchoredFinalState, boolean hasBackwardPrefixState, boolean utf16MustDecode) { byte flags = 0; if (finalState) { flags |= FLAG_FINAL_STATE; @@ -131,19 +222,18 @@ public static byte buildFlags(boolean finalState, boolean anchoredFinalState, bo if (hasBackwardPrefixState) { flags |= FLAG_HAS_BACKWARD_PREFIX_STATE; } + if (utf16MustDecode) { + flags |= FLAG_UTF_16_MUST_DECODE; + } return flags; } - public static LoopOptimizationNode buildLoopOptimizationNode(short loopTransitionIndex, char[] indexOfChars) { - return new LoopOptimizationNode(loopTransitionIndex, indexOfChars); - } - @Override public DFAStateNode createNodeSplitCopy(short copyID) { return new DFAStateNode(this, copyID); } - public final CharMatcher[] getMatchers() { + public final Matchers getMatchers() { return matchers; } @@ -163,12 +253,16 @@ public boolean hasBackwardPrefixState() { return flagIsSet(FLAG_HAS_BACKWARD_PREFIX_STATE); } + public boolean utf16MustDecode() { + return flagIsSet(FLAG_UTF_16_MUST_DECODE); + } + private boolean flagIsSet(byte flag) { return (flags & flag) != 0; } public boolean hasLoopToSelf() { - return loopOptimizationNode != null; + return loopTransitionIndex >= 0; } boolean isLoopToSelf(int transitionIndex) { @@ -177,7 +271,7 @@ boolean isLoopToSelf(int transitionIndex) { short getLoopToSelf() { assert hasLoopToSelf(); - return loopOptimizationNode.loopTransitionIndex; + return loopTransitionIndex; } boolean treeTransitionMatching() { @@ -188,19 
+282,6 @@ AllTransitionsInOneTreeMatcher getTreeMatcher() { return allTransitionsInOneTreeMatcher; } - boolean sameResultAsRegularMatchers(TRegexDFAExecutorNode executor, int c, boolean compactString, int allTransitionsMatcherResult) { - CompilerAsserts.neverPartOfCompilation(); - if (executor.isRegressionTestMode()) { - for (int i = 0; i < matchers.length; i++) { - if (matchers[i].execute(c, compactString)) { - return i == allTransitionsMatcherResult; - } - } - return allTransitionsMatcherResult == -1; - } - return true; - } - /** * Calculates this state's successor by finding a transition that matches the current input. If * the successor is the state itself, this method continues consuming input characters until a @@ -217,7 +298,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut CompilerAsserts.partialEvaluationConstant(this); CompilerAsserts.partialEvaluationConstant(compactString); if (hasLoopToSelf()) { - if (executor.isForward() && loopOptimizationNode.indexOfChars != null) { + if (executor.isForward() && loopOptimizationNode != null) { runIndexOf(locals, executor, compactString); } else { while (executor.inputHasNext(locals)) { @@ -226,7 +307,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut // simpleCG mode checkFinalState(locals, executor); } - if (!checkMatch(locals, executor, compactString)) { + if (!checkMatchOrTree(locals, executor, compactString)) { if (!executor.isSimpleCG()) { // in ignore-capture-groups mode, we can delay the final state check checkFinalState(locals, executor); @@ -244,7 +325,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut return; } checkFinalState(locals, executor); - checkMatch(locals, executor, compactString); + checkMatchOrTree(locals, executor, compactString); executor.inputAdvance(locals); } } @@ -252,10 +333,7 @@ public void executeFindSuccessor(TRegexDFAExecutorLocals locals, TRegexDFAExecut private void 
runIndexOf(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { assert executor.isForward(); final int preLoopIndex = locals.getIndex(); - int indexOfResult = loopOptimizationNode.getIndexOfNode().execute(locals.getInput(), - preLoopIndex, - executor.getMaxIndex(locals), - loopOptimizationNode.indexOfChars); + int indexOfResult = loopOptimizationNode.execute(locals.getInput(), preLoopIndex, executor.getMaxIndex(locals)); locals.setIndex(indexOfResult < 0 ? executor.getMaxIndex(locals) : indexOfResult); if (simpleCG != null && locals.getIndex() > preLoopIndex) { int curIndex = locals.getIndex(); @@ -276,12 +354,36 @@ private void runIndexOf(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode ex executor.inputIncRaw(locals, loopOptimizationNode.encodedLength()); locals.setSuccessorIndex(successor); } else { - checkMatch(locals, executor, compactString); + checkMatchOrTree(locals, executor, compactString); executor.inputAdvance(locals); } } } + private boolean checkMatchOrTree(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { + if (treeTransitionMatching()) { + return doTreeMatch(locals, executor, compactString); + } else { + return checkMatch(locals, executor, compactString, false, 0); + } + } + + boolean doTreeMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { + final int c = executor.inputRead(locals); + int successor = getTreeMatcher().checkMatchTree(locals, executor, this, c); + assert sameResultAsRegularMatchers(executor, c, compactString, successor) : this.toString(); + locals.setSuccessorIndex(successor); + return isLoopToSelf(successor); + } + + boolean sameResultAsRegularMatchers(TRegexDFAExecutorNode executor, int c, boolean compactString, int allTransitionsMatcherResult) { + CompilerAsserts.neverPartOfCompilation(); + if (executor.isRegressionTestMode()) { + return allTransitionsMatcherResult == matchers.match(c, compactString); + } + 
return true; + } + + /** * Finds the first matching transition. The index of the element of {@link #getMatchers()} that * matched the current input character ( @@ -297,27 +399,170 @@ private void runIndexOf(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode ex * otherwise. */ @ExplodeLoop(kind = ExplodeLoop.LoopExplosionKind.FULL_EXPLODE_UNTIL_RETURN) - private boolean checkMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString) { - final int c = executor.inputRead(locals); - if (treeTransitionMatching()) { - int successor = getTreeMatcher().checkMatchTree(locals, executor, this, c); - assert sameResultAsRegularMatchers(executor, c, compactString, successor) : this.toString(); - locals.setSuccessorIndex(successor); - return isLoopToSelf(successor); + boolean checkMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean compactString, boolean loopMode, int preLoopIndex) { + CompilerAsserts.partialEvaluationConstant(loopMode); + if (matchers instanceof SimpleMatchers) { + final int c = executor.inputRead(locals); + CharMatcher[] cMatchers = ((SimpleMatchers) matchers).getMatchers(); + if (cMatchers != null) { + for (int i = 0; i < cMatchers.length; i++) { + if (match(cMatchers, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + } else if (matchers instanceof UTF8Matchers) { + UTF8Matchers utf8Matchers = (UTF8Matchers) matchers; + CharMatcher[] ascii = utf8Matchers.getAscii(); + CharMatcher[] enc2 = utf8Matchers.getEnc2(); + CharMatcher[] enc3 = utf8Matchers.getEnc3(); + CharMatcher[] enc4 = utf8Matchers.getEnc4(); + + final int c = executor.inputReadRaw(locals); + + if (executor.isForward()) { + if (executor.getInputProfile().profile(c < 128)) { + locals.setNextIndex(executor.inputIncRaw(locals.getIndex())); + if (ascii != null) { + for (int i = 0; i < ascii.length; i++) { + if (match(ascii, i, c, compactString)) { + return 
checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + } else { + int nBytes = Integer.numberOfLeadingZeros(~(c << 24)); + assert 1 < nBytes && nBytes < 5 : nBytes; + locals.setNextIndex(executor.inputIncRaw(nBytes)); + int codepoint; + switch (nBytes) { + case 2: + locals.setNextIndex(executor.inputIncRaw(2)); + if (enc2 != null) { + codepoint = ((c & 0x3f) << 6) | (executor.inputReadRaw(locals, locals.getIndex() + 1) & 0x3f); + for (int i = 0; i < enc2.length; i++) { + if (match(enc2, i, codepoint, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + break; + case 3: + locals.setNextIndex(executor.inputIncRaw(3)); + if (enc3 != null) { + codepoint = ((c & 0x1f) << 12) | ((executor.inputReadRaw(locals, locals.getIndex() + 1) & 0x3f) << 6) | (executor.inputReadRaw(locals, locals.getIndex() + 2) & 0x3f); + for (int i = 0; i < enc3.length; i++) { + if (match(enc3, i, codepoint, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + break; + case 4: + locals.setNextIndex(executor.inputIncRaw(4)); + if (enc4 != null) { + codepoint = ((c & 0x0f) << 18) | + ((executor.inputReadRaw(locals, locals.getIndex() + 1) & 0x3f) << 12) | + ((executor.inputReadRaw(locals, locals.getIndex() + 2) & 0x3f) << 6) | + (executor.inputReadRaw(locals, locals.getIndex() + 3) & 0x3f); + for (int i = 0; i < enc4.length; i++) { + if (match(enc4, i, codepoint, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + break; + } + } + } + + } else if (matchers instanceof UTF16RawMatchers) { + final int c = executor.inputRead(locals); + CharMatcher[] latin1 = ((UTF16RawMatchers) matchers).getLatin1(); + CharMatcher[] bmp = ((UTF16RawMatchers) matchers).getBmp(); + if (latin1 != null && (bmp == null || compactString || executor.getInputProfile().profile(c < 256))) { + for (int i = 0; i 
< latin1.length; i++) { + if (match(latin1, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } else if (bmp != null) { + for (int i = 0; i < bmp.length; i++) { + if (match(bmp, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } } else { - for (int i = 0; i < matchers.length; i++) { - if (matchers[i].execute(c, compactString)) { - CompilerAsserts.partialEvaluationConstant(i); - locals.setSuccessorIndex(i); - successorFound(locals, executor, i); - return isLoopToSelf(i); + assert matchers instanceof UTF16Matchers; + assert executor.getEncoding() == Encodings.UTF_16; + UTF16Matchers utf16Matchers = (UTF16Matchers) matchers; + CharMatcher[] latin1 = utf16Matchers.getLatin1(); + CharMatcher[] bmp = utf16Matchers.getBmp(); + CharMatcher[] astral = utf16Matchers.getAstral(); + + locals.setNextIndex(executor.inputIncRaw(locals.getIndex())); + int c = executor.inputReadRaw(locals); + + if (utf16MustDecode() && executor.inputUTF16IsHighSurrogate(c) && executor.inputHasNext(locals, locals.getNextIndex())) { + int c2 = executor.inputReadRaw(locals, locals.getNextIndex()); + if (executor.inputUTF16IsLowSurrogate(c2)) { + locals.setNextIndex(executor.inputIncRaw(locals.getNextIndex())); + if (astral != null) { + c = executor.inputUTF16ToCodePoint(c, c2); + for (int i = 0; i < astral.length; i++) { + if (match(astral, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + return checkMatchNoMatch(locals, executor, loopMode, preLoopIndex); + } + } else if (latin1 != null && (bmp == null || compactString || executor.getInputProfile().profile(c < 256))) { + for (int i = 0; i < latin1.length; i++) { + if (match(latin1, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } } } - 
locals.setSuccessorIndex(FS_RESULT_NO_SUCCESSOR); + if (bmp != null) { + for (int i = 0; i < bmp.length; i++) { + if (match(bmp, i, c, compactString)) { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, i); + } + } + } + } + return checkMatchNoMatch(locals, executor, loopMode, preLoopIndex); + } + + private boolean checkMatchNoMatch(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean loopMode, int preLoopIndex) { + if (matchers.getNoMatchSuccessor() == -1) { + if (loopMode) { + noSuccessorLoop(locals, executor, preLoopIndex); + } + locals.setSuccessorIndex(-1); return false; + } else { + return checkMatchSuccessorFoundHook(locals, executor, loopMode, preLoopIndex, matchers.getNoMatchSuccessor()); } } + private static boolean match(CharMatcher[] matchers, int i, final int c, boolean compactString) { + return matchers[i] != null && matchers[i].execute(c, compactString); + } + + private boolean checkMatchSuccessorFoundHook(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, boolean loopMode, int preLoopIndex, int i) { + CompilerAsserts.partialEvaluationConstant(i); + locals.setSuccessorIndex(i); + if (loopMode) { + successorFoundLoop(locals, executor, i, preLoopIndex); + } else { + successorFound(locals, executor, i); + } + return isLoopToSelf(i); + } + private void checkFinalState(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor) { CompilerAsserts.partialEvaluationConstant(this); if (isFinalState()) { @@ -360,6 +605,20 @@ void successorFound(TRegexDFAExecutorLocals locals, @SuppressWarnings("unused") } } + /** + * Hook for {@link CGTrackingDFAStateNode}. + */ + @SuppressWarnings("unused") + void successorFoundLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int i, int preLoopIndex) { + } + + /** + * Hook for {@link CGTrackingDFAStateNode}. 
+ */ + @SuppressWarnings("unused") + void noSuccessorLoop(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, int preLoopIndex) { + } + void storeResult(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, @SuppressWarnings("unused") boolean anchored) { CompilerAsserts.partialEvaluationConstant(this); if (executor.isSimpleCG()) { @@ -390,7 +649,7 @@ public String toString() { StringBuilder sb = new StringBuilder(); DebugUtil.appendNodeId(sb, getId()).append(": "); if (!treeTransitionMatching()) { - sb.append(matchers.length).append(" successors"); + sb.append(matchers.size()).append(" successors"); } if (isAnchoredFinalState()) { sb.append(", AFS"); @@ -402,8 +661,8 @@ public String toString() { if (treeTransitionMatching()) { sb.append(" ").append(getTreeMatcher()).append("\n successors: ").append(Arrays.toString(successors)).append("\n"); } else { - for (int i = 0; i < matchers.length; i++) { - sb.append(" ").append(i).append(": ").append(matchers[i]).append(" -> "); + for (int i = 0; i < matchers.size(); i++) { + sb.append(" ").append(i).append(": ").append(matchers.toString(i)).append(" -> "); DebugUtil.appendNodeId(sb, getSuccessors()[i]).append("\n"); } } @@ -415,8 +674,8 @@ public String toString() { public JsonValue toJson() { JsonArray transitions = Json.array(); if (matchers != null) { - for (int i = 0; i < matchers.length; i++) { - transitions.append(Json.obj(Json.prop("matcher", matchers[i].toString()), Json.prop("target", successors[i]))); + for (int i = 0; i < matchers.size(); i++) { + transitions.append(Json.obj(Json.prop("matcher", matchers.toString(i)), Json.prop("target", successors[i]))); } } return Json.obj(Json.prop("id", getId()), diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/Matchers.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/Matchers.java new file mode 100644 index 000000000000..b307e79321ee --- /dev/null +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/Matchers.java @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package com.oracle.truffle.regex.tregex.nodes.dfa; + +import java.util.Objects; + +import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.regex.charset.CharMatchers; +import com.oracle.truffle.regex.charset.CodePointSet; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; +import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer; +import com.oracle.truffle.regex.tregex.matchers.CharMatcher; + +/** + * Container for character matchers of DFA transitions, potentially specialized for a given string + * encoding. + */ +public abstract class Matchers extends Node { + + private final short noMatchSuccessor; + + Matchers(short noMatchSuccessor) { + this.noMatchSuccessor = noMatchSuccessor; + } + + public short getNoMatchSuccessor() { + return noMatchSuccessor; + } + + /** + * Returns the number of transitions represented by this object. + */ + public abstract int size(); + + private static int size(CharMatcher[]... matchersArr) { + for (CharMatcher[] matchers : matchersArr) { + if (matchers != null) { + return matchers.length; + } + } + return 0; + } + + /** + * Returns {@code true} iff transition {@code i} matches {@code c}. + */ + public abstract boolean match(int i, int c, boolean compactString); + + /** + * Returns the index of the transition that matches the given character {@code c}, or + * {@code noMatchSuccessor}. For debugging purposes. + */ + public int match(int c, boolean compactString) { + for (int i = 0; i < size(); i++) { + if (match(i, c, compactString)) { + return i; + } + } + return noMatchSuccessor; + } + + /** + * Returns a String representation of transition {@code i}. 
+ */ + public abstract String toString(int i); + + private static boolean match(CharMatcher[] matchers, int i, int c, boolean compactString) { + return matchers != null && matchers[i] != null && matchers[i].execute(c, compactString); + } + + @TruffleBoundary + private static String toString(CharMatcher[] matchers, int i) { + return matchers == null ? "" : Objects.toString(matchers[i]); + } + + public static final class SimpleMatchers extends Matchers { + + @Children private final CharMatcher[] matchers; + + public SimpleMatchers(CharMatcher[] matchers, short noMatchSuccessor) { + super(noMatchSuccessor); + this.matchers = matchers; + } + + public CharMatcher[] getMatchers() { + return matchers; + } + + @Override + public int size() { + return matchers.length; + } + + @Override + public boolean match(int i, int c, boolean compactString) { + return match(matchers, i, c, compactString); + } + + @TruffleBoundary + @Override + public String toString(int i) { + return matchers[i].toString(); + } + } + + public static final class UTF16RawMatchers extends Matchers { + + @Children private final CharMatcher[] latin1; + @Children private final CharMatcher[] bmp; + + public UTF16RawMatchers(CharMatcher[] latin1, CharMatcher[] bmp, short noMatchSuccessor) { + super(noMatchSuccessor); + this.latin1 = latin1; + this.bmp = bmp; + } + + public CharMatcher[] getLatin1() { + return latin1; + } + + public CharMatcher[] getBmp() { + return bmp; + } + + @Override + public int size() { + return size(latin1, bmp); + } + + @Override + public boolean match(int i, int c, boolean compactString) { + return match(latin1, i, c, compactString) || match(bmp, i, c, compactString); + } + + @TruffleBoundary + @Override + public String toString(int i) { + return toString(latin1, i) + toString(bmp, i); + } + } + + public static final class UTF16Matchers extends Matchers { + + @Children private final CharMatcher[] latin1; + @Children private final CharMatcher[] bmp; + @Children private final 
CharMatcher[] astral; + + public UTF16Matchers(CharMatcher[] latin1, CharMatcher[] bmp, CharMatcher[] astral, short noMatchSuccessor) { + super(noMatchSuccessor); + this.latin1 = latin1; + this.bmp = bmp; + this.astral = astral; + } + + public CharMatcher[] getLatin1() { + return latin1; + } + + public CharMatcher[] getBmp() { + return bmp; + } + + public CharMatcher[] getAstral() { + return astral; + } + + @Override + public int size() { + return size(latin1, bmp, astral); + } + + @Override + public boolean match(int i, int c, boolean compactString) { + return match(latin1, i, c, compactString) || match(bmp, i, c, compactString) || match(astral, i, c, compactString); + } + + @TruffleBoundary + @Override + public String toString(int i) { + return toString(latin1, i) + toString(bmp, i) + toString(astral, i); + } + } + + public static final class UTF8Matchers extends Matchers { + + @Children private final CharMatcher[] ascii; + @Children private final CharMatcher[] enc2; + @Children private final CharMatcher[] enc3; + @Children private final CharMatcher[] enc4; + + public UTF8Matchers(CharMatcher[] ascii, CharMatcher[] enc2, CharMatcher[] enc3, CharMatcher[] enc4, short noMatchSuccessor) { + super(noMatchSuccessor); + this.ascii = ascii; + this.enc2 = enc2; + this.enc3 = enc3; + this.enc4 = enc4; + } + + public CharMatcher[] getAscii() { + return ascii; + } + + public CharMatcher[] getEnc2() { + return enc2; + } + + public CharMatcher[] getEnc3() { + return enc3; + } + + public CharMatcher[] getEnc4() { + return enc4; + } + + @Override + public int size() { + return size(ascii, enc2, enc3, enc4); + } + + @Override + public boolean match(int i, int c, boolean compactString) { + return match(ascii, i, c, compactString) || match(enc2, i, c, compactString) || match(enc3, i, c, compactString) || match(enc4, i, c, compactString); + } + + @TruffleBoundary + @Override + public String toString(int i) { + return toString(ascii, i) + toString(enc2, i) + toString(enc3, i) + toString(enc4, i); + } + } + + public static final class Builder { + + 
private final ObjectArrayBuffer[] buffers; + private short noMatchSuccessor = -1; + + @SuppressWarnings("unchecked") + public Builder(int nBuffers) { + buffers = new ObjectArrayBuffer[nBuffers]; + for (int i = 0; i < buffers.length; i++) { + buffers[i] = new ObjectArrayBuffer<>(); + } + } + + public void reset(int nTransitions) { + for (ObjectArrayBuffer buf : buffers) { + buf.asFixedSizeArray(nTransitions); + } + noMatchSuccessor = -1; + } + + public ObjectArrayBuffer getBuffer(int i) { + return buffers[i]; + } + + public short getNoMatchSuccessor() { + return noMatchSuccessor; + } + + public void setNoMatchSuccessor(short noMatchSuccessor) { + this.noMatchSuccessor = noMatchSuccessor; + } + + public int estimatedCost(int i) { + int ret = 0; + for (ObjectArrayBuffer buf : buffers) { + if (buf != null && buf.get(i) != null) { + ret = Math.max(ret, buf.get(i).estimatedCost()); + } + } + return ret; + } + + public void createSplitMatcher(int i, CodePointSet cps, CompilationBuffer compilationBuffer, CodePointSet... splitRanges) { + for (int j = 0; j < splitRanges.length; j++) { + CodePointSet intersection = splitRanges[j].createIntersection(cps, compilationBuffer); + if (intersection.matchesSomething()) { + buffers[j].set(i, CharMatchers.createMatcher(intersection, compilationBuffer)); + } + } + } + + public CharMatcher[] materialize(int buf) { + return isEmpty(buffers[buf]) ? 
null : buffers[buf].toArray(new CharMatcher[buffers[buf].length()]); + } + + private static boolean isEmpty(ObjectArrayBuffer buf) { + for (CharMatcher m : buf) { + if (m != null) { + return false; + } + } + return true; + } + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TraceFinderDFAStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TraceFinderDFAStateNode.java index 62d538f2743e..35230fd84f14 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TraceFinderDFAStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TraceFinderDFAStateNode.java @@ -40,8 +40,6 @@ */ package com.oracle.truffle.regex.tregex.nodes.dfa; -import com.oracle.truffle.regex.tregex.matchers.CharMatcher; - public class TraceFinderDFAStateNode extends BackwardDFAStateNode { public static final byte NO_PRE_CALC_RESULT = (byte) 0xff; @@ -49,9 +47,9 @@ public class TraceFinderDFAStateNode extends BackwardDFAStateNode { private final byte preCalculatedUnAnchoredResult; private final byte preCalculatedAnchoredResult; - public TraceFinderDFAStateNode(short id, byte flags, LoopOptimizationNode loopOptimizationNode, short[] successors, CharMatcher[] matchers, + public TraceFinderDFAStateNode(short id, byte flags, short loopTransitionIndex, LoopOptimizationNode loopOptimizationNode, short[] successors, Matchers matchers, AllTransitionsInOneTreeMatcher allTransitionsInOneTreeMatcher, byte preCalculatedUnAnchoredResult, byte preCalculatedAnchoredResult) { - super(id, flags, loopOptimizationNode, successors, matchers, null, allTransitionsInOneTreeMatcher); + super(id, flags, loopTransitionIndex, loopOptimizationNode, successors, matchers, null, allTransitionsInOneTreeMatcher); this.preCalculatedUnAnchoredResult = preCalculatedUnAnchoredResult; this.preCalculatedAnchoredResult = 
initPreCalculatedAnchoredResult(preCalculatedUnAnchoredResult, preCalculatedAnchoredResult); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEndsWithNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEndsWithNode.java index 872ff78cc0ed..cfd7f4dbe8f6 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEndsWithNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEndsWithNode.java @@ -45,7 +45,6 @@ import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.regex.tregex.string.StringUTF16; public abstract class InputEndsWithNode extends Node { @@ -56,37 +55,76 @@ public static InputEndsWithNode create() { public abstract boolean execute(Object input, Object suffix, Object mask); @Specialization(guards = "mask == null") - public boolean endsWith(String input, StringUTF16 suffix, @SuppressWarnings("unused") Object mask) { - return input.endsWith(suffix.toString()); + public boolean doBytes(byte[] input, byte[] suffix, @SuppressWarnings("unused") Object mask) { + return ArrayUtils.regionEqualsWithOrMask(input, input.length - suffix.length, suffix, 0, suffix.length, null); } @Specialization(guards = "mask != null") - public boolean endsWithWithMask(String input, StringUTF16 suffix, StringUTF16 mask) { - return ArrayUtils.regionEqualsWithOrMask(input, input.length() - suffix.encodedLength(), suffix.toString(), 0, mask.encodedLength(), mask.toString()); + public boolean doBytesMask(byte[] input, byte[] suffix, byte[] mask) { + return ArrayUtils.regionEqualsWithOrMask(input, input.length - suffix.length, suffix, 0, mask.length, mask); } @Specialization(guards = "mask == null") - public boolean endsWithTruffleObjNoMask(TruffleObject input, StringUTF16 suffix, 
@SuppressWarnings("unused") Object mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { + public boolean doString(String input, String suffix, @SuppressWarnings("unused") Object mask) { + return input.endsWith(suffix); + } + + @Specialization(guards = "mask != null") + public boolean doStringMask(String input, String suffix, String mask) { + return ArrayUtils.regionEqualsWithOrMask(input, input.length() - suffix.length(), suffix, 0, mask.length(), mask); + } + + @Specialization(guards = "mask == null") + public boolean doTruffleObjBytes(TruffleObject input, byte[] suffix, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { return endsWithTruffleObj(input, suffix, null, lengthNode, charAtNode); } @Specialization(guards = "mask != null") - public boolean endsWithTruffleObjWithMask(TruffleObject input, StringUTF16 suffix, StringUTF16 mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { - assert mask.encodedLength() == suffix.encodedLength(); + public boolean doTruffleObjBytesMask(TruffleObject input, byte[] suffix, byte[] mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert mask.length == suffix.length; return endsWithTruffleObj(input, suffix, mask, lengthNode, charAtNode); } - private static boolean endsWithTruffleObj(TruffleObject input, StringUTF16 suffix, StringUTF16 mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + @Specialization(guards = "mask == null") + public boolean doTruffleObjString(TruffleObject input, String suffix, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + return endsWithTruffleObj(input, suffix, null, lengthNode, charAtNode); + } + + @Specialization(guards = "mask != null") + public boolean doTruffleObjStringMask(TruffleObject input, String suffix, String 
mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert mask.length() == suffix.length(); + return endsWithTruffleObj(input, suffix, mask, lengthNode, charAtNode); + } + + private static boolean endsWithTruffleObj(TruffleObject input, byte[] suffix, byte[] mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + final int inputLength = lengthNode.execute(input); + if (inputLength < suffix.length) { + return false; + } + final int offset = inputLength - suffix.length; + for (int i = 0; i < suffix.length; i++) { + if (InputReadNode.readWithMask(input, offset + i, mask, i, charAtNode) != Byte.toUnsignedInt(suffix[i])) { + return false; + } + } + return true; + } + + private static boolean endsWithTruffleObj(TruffleObject input, String suffix, String mask, InputLengthNode lengthNode, InputReadNode charAtNode) { final int inputLength = lengthNode.execute(input); - if (inputLength < suffix.encodedLength()) { + if (inputLength < suffix.length()) { return false; } - final int offset = inputLength - suffix.encodedLength(); - for (int i = 0; i < suffix.encodedLength(); i++) { + final int offset = inputLength - suffix.length(); + for (int i = 0; i < suffix.length(); i++) { if (InputReadNode.readWithMask(input, offset + i, mask, i, charAtNode) != suffix.charAt(i)) { return false; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEqualsNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEqualsNode.java index e7f5862e4d48..b24190e22dda 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEqualsNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputEqualsNode.java @@ -40,12 +40,13 @@ */ package com.oracle.truffle.regex.tregex.nodes.input; +import java.util.Arrays; + import com.oracle.truffle.api.ArrayUtils; import 
com.oracle.truffle.api.dsl.Cached; import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.regex.tregex.string.StringUTF16; public abstract class InputEqualsNode extends Node { @@ -56,38 +57,74 @@ public static InputEqualsNode create() { public abstract boolean execute(Object input, Object string, Object mask); @Specialization(guards = "mask == null") - public boolean execEquals(String input, StringUTF16 string, @SuppressWarnings("unused") Object mask) { - return input.equals(string.toString()); + public boolean doBytes(byte[] input, byte[] string, @SuppressWarnings("unused") Object mask) { + return Arrays.equals(input, string); + } + + @Specialization(guards = "mask != null") + public boolean doBytesMask(byte[] input, byte[] string, byte[] mask) { + return input.length == string.length && ArrayUtils.regionEqualsWithOrMask(input, 0, string, 0, mask.length, mask); + } + + @Specialization(guards = "mask == null") + public boolean doString(String input, String string, @SuppressWarnings("unused") Object mask) { + return input.equals(string); } @Specialization(guards = "mask != null") - public boolean execEqualsWithMask(String input, StringUTF16 string, StringUTF16 mask) { - return input.length() == string.encodedLength() && ArrayUtils.regionEqualsWithOrMask(input, 0, string.toString(), 0, mask.encodedLength(), mask.toString()); + public boolean doStringMask(String input, String string, String mask) { + return input.length() == string.length() && ArrayUtils.regionEqualsWithOrMask(input, 0, string, 0, mask.length(), mask); } @Specialization(guards = "mask == null") - public boolean equalsTruffleObjNoMask(TruffleObject input, StringUTF16 string, @SuppressWarnings("unused") Object mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { + public boolean doTruffleObjBytes(TruffleObject input, byte[] string, 
@SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { return equalsTruffleObj(input, string, null, lengthNode, charAtNode); } @Specialization(guards = "mask != null") - public boolean equalsTruffleObjWithMask(TruffleObject input, StringUTF16 string, StringUTF16 mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { + public boolean doTruffleObjBytesMask(TruffleObject input, byte[] string, byte[] mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { return equalsTruffleObj(input, string, mask, lengthNode, charAtNode); } - private static boolean equalsTruffleObj(TruffleObject input, StringUTF16 string, StringUTF16 mask, InputLengthNode lengthNode, InputReadNode charAtNode) { - if (lengthNode.execute(input) != string.encodedLength()) { + @Specialization(guards = "mask == null") + public boolean doTruffleObjString(TruffleObject input, String string, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + return equalsTruffleObj(input, string, null, lengthNode, charAtNode); + } + + @Specialization(guards = "mask != null") + public boolean doTruffleObjStringMask(TruffleObject input, String string, String mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + return equalsTruffleObj(input, string, mask, lengthNode, charAtNode); + } + + private static boolean equalsTruffleObj(TruffleObject input, String string, String mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + if (lengthNode.execute(input) != string.length()) { return false; } - for (int i = 0; i < string.encodedLength(); i++) { + for (int i = 0; i < string.length(); i++) { if (InputReadNode.readWithMask(input, i, mask, i, charAtNode) != string.charAt(i)) { return false; } } return true; } + + private static boolean equalsTruffleObj(TruffleObject input, byte[] string, 
byte[] mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + if (lengthNode.execute(input) != string.length) { + return false; + } + for (int i = 0; i < string.length; i++) { + if (InputReadNode.readWithMask(input, i, mask, i, charAtNode) != Byte.toUnsignedInt(string[i])) { + return false; + } + } + return true; + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfNode.java index dd429b662dd3..2c92c4340743 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfNode.java @@ -52,20 +52,25 @@ public static InputIndexOfNode create() { return InputIndexOfNodeGen.create(); } - public abstract int execute(Object input, int fromIndex, int maxIndex, char[] chars); + public abstract int execute(Object input, int fromIndex, int maxIndex, Object chars); @Specialization - public int indexOf(String input, int fromIndex, int maxIndex, char[] chars) { + public int doBytes(byte[] input, int fromIndex, int maxIndex, byte[] bytes) { + return ArrayUtils.indexOf(input, fromIndex, maxIndex, bytes); + } + + @Specialization + public int doChars(String input, int fromIndex, int maxIndex, char[] chars) { return ArrayUtils.indexOf(input, fromIndex, maxIndex, chars); } @Specialization - public int indexOf(TruffleObject input, int fromIndex, int maxIndex, char[] chars, - @Cached("create()") InputReadNode charAtNode) { + public int doTruffleObjBytes(TruffleObject input, int fromIndex, int maxIndex, byte[] bytes, + @Cached InputReadNode charAtNode) { for (int i = fromIndex; i < maxIndex; i++) { int c = charAtNode.execute(input, i); - for (char v : chars) { - if (c == v) { + for (byte v : bytes) { + if (c == Byte.toUnsignedInt(v)) { return i; } } @@ -73,7 
+78,17 @@ public int indexOf(TruffleObject input, int fromIndex, int maxIndex, char[] char return -1; } - static boolean maskIsZero(char mask) { - return mask == 0; + @Specialization + public int doTruffleObjChars(TruffleObject input, int fromIndex, int maxIndex, char[] chars, + @Cached InputReadNode charAtNode) { + for (int i = fromIndex; i < maxIndex; i++) { + int c = charAtNode.execute(input, i); + for (char v : chars) { + if (c == v) { + return i; + } + } + } + return -1; } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfStringNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfStringNode.java index bb833cd91afd..db1214a3bfd2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfStringNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputIndexOfStringNode.java @@ -47,7 +47,6 @@ import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.api.nodes.Node; import com.oracle.truffle.regex.RegexRootNode; -import com.oracle.truffle.regex.tregex.string.StringUTF16; public abstract class InputIndexOfStringNode extends Node { @@ -58,31 +57,62 @@ public static InputIndexOfStringNode create() { public abstract int execute(Object input, int fromIndex, int maxIndex, Object match, Object mask); @Specialization(guards = "mask == null") - public int doString(String input, int fromIndex, int maxIndex, StringUTF16 match, @SuppressWarnings("unused") Object mask) { - int result = input.indexOf(match.toString(), fromIndex); + public int doBytes(byte[] input, int fromIndex, int maxIndex, byte[] match, @SuppressWarnings("unused") Object mask) { + return ArrayUtils.indexOfWithOrMask(input, fromIndex, maxIndex - fromIndex, match, null); + } + + @Specialization(guards = "mask != null") + public int doBytesMask(byte[] input, int fromIndex, int maxIndex, 
byte[] match, byte[] mask) { + return ArrayUtils.indexOfWithOrMask(input, fromIndex, maxIndex - fromIndex, match, mask); + } + + @Specialization(guards = "mask == null") + public int doString(String input, int fromIndex, int maxIndex, String match, @SuppressWarnings("unused") Object mask) { + int result = input.indexOf(match, fromIndex); return result >= maxIndex ? -1 : result; } @Specialization(guards = "mask != null") - public int doStringWithMask(String input, int fromIndex, int maxIndex, StringUTF16 match, StringUTF16 mask) { - return ArrayUtils.indexOfWithOrMask(input, fromIndex, maxIndex - fromIndex, match.toString(), mask.toString()); + public int doStringMask(String input, int fromIndex, int maxIndex, String match, String mask) { + return ArrayUtils.indexOfWithOrMask(input, fromIndex, maxIndex - fromIndex, match, mask); + } + + @Specialization + public int doTruffleObjBytes(TruffleObject input, int fromIndex, int maxIndex, byte[] match, Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputRegionMatchesNode regionMatchesNode) { + if (maxIndex > lengthNode.execute(input)) { + return -1; + } + if (fromIndex + match.length > maxIndex) { + return -1; + } + for (int i = fromIndex; i <= maxIndex - match.length; i++) { + if (CompilerDirectives.inInterpreter()) { + RegexRootNode.checkThreadInterrupted(); + } + if (regionMatchesNode.execute(input, i, match, 0, match.length, mask)) { + return i; + } + } + return -1; } @Specialization - public int doTruffleObject(TruffleObject input, int fromIndex, int maxIndex, StringUTF16 match, Object mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputRegionMatchesNode regionMatchesNode) { + public int doTruffleObjString(TruffleObject input, int fromIndex, int maxIndex, String match, Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputRegionMatchesNode regionMatchesNode) { if (maxIndex > lengthNode.execute(input)) { return -1; } - if (fromIndex + match.encodedLength() > 
maxIndex) { + if (fromIndex + match.length() > maxIndex) { return -1; } - for (int i = fromIndex; i <= maxIndex - match.encodedLength(); i++) { + for (int i = fromIndex; i <= maxIndex - match.length(); i++) { if (CompilerDirectives.inInterpreter()) { RegexRootNode.checkThreadInterrupted(); } - if (regionMatchesNode.execute(input, i, match, 0, match.encodedLength(), mask)) { + if (regionMatchesNode.execute(input, i, match, 0, match.length(), mask)) { return i; } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLengthNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLengthNode.java index 38cdc2aad691..d11385a28249 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLengthNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLengthNode.java @@ -58,12 +58,17 @@ public static InputLengthNode create() { public abstract int execute(Object input); @Specialization - static int getLength(String input) { + static int doBytes(byte[] input) { + return input.length; + } + + @Specialization + static int doString(String input) { return input.length(); } @Specialization(guards = "inputs.hasArrayElements(input)", limit = "2") - static int doBoxedCharArray(Object input, + static int doTruffleObj(Object input, @CachedLibrary("input") InteropLibrary inputs) { try { long length = inputs.getArraySize(input); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputReadNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputReadNode.java index d3d0b8538287..bf3b6e664549 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputReadNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputReadNode.java @@ -53,7 +53,6 
@@ import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; import com.oracle.truffle.regex.runtime.nodes.ToCharNode; -import com.oracle.truffle.regex.tregex.string.StringUTF16; @GenerateUncached public abstract class InputReadNode extends Node { @@ -64,6 +63,11 @@ public static InputReadNode create() { public abstract int execute(Object input, int index); + @Specialization + static int doBytes(byte[] input, int index) { + return Byte.toUnsignedInt(input[index]); + } + @Specialization static int doString(String input, int index) { return input.charAt(index); @@ -82,9 +86,15 @@ static int doBoxedCharArray(Object input, int index, } } - public static int readWithMask(TruffleObject input, int indexInput, StringUTF16 mask, int indexMask, InputReadNode charAtNode) { + public static int readWithMask(TruffleObject input, int indexInput, String mask, int indexMask, InputReadNode charAtNode) { CompilerAsserts.partialEvaluationConstant(mask == null); int c = charAtNode.execute(input, indexInput); return (mask == null ? c : (c | mask.charAt(indexMask))); } + + public static int readWithMask(TruffleObject input, int indexInput, byte[] mask, int indexMask, InputReadNode charAtNode) { + CompilerAsserts.partialEvaluationConstant(mask == null); + int c = charAtNode.execute(input, indexInput); + return (mask == null ? 
c : (c | Byte.toUnsignedInt(mask[indexMask]))); + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputRegionMatchesNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputRegionMatchesNode.java index 770328fd0a59..1244648302ce 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputRegionMatchesNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputRegionMatchesNode.java @@ -45,7 +45,6 @@ import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.regex.tregex.string.StringUTF16; public abstract class InputRegionMatchesNode extends Node { @@ -56,41 +55,61 @@ public static InputRegionMatchesNode create() { public abstract boolean execute(Object input, int fromIndex1, Object match, int fromIndex2, int length, Object mask); @Specialization(guards = "mask == null") - public boolean regionMatchesJavaString(String input, int fromIndex1, String match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask) { + public boolean doBytes(byte[] input, int fromIndex1, byte[] match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask) { + return ArrayUtils.regionEqualsWithOrMask(input, fromIndex1, match, fromIndex2, length, null); + } + + @Specialization(guards = "mask != null") + public boolean doBytesMask(byte[] input, int fromIndex1, byte[] match, int fromIndex2, int length, byte[] mask) { + return ArrayUtils.regionEqualsWithOrMask(input, fromIndex1, match, fromIndex2, length, mask); + } + + @Specialization(guards = "mask == null") + public boolean doString(String input, int fromIndex1, String match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask) { return input.regionMatches(fromIndex1, match, fromIndex2, length); } + 
@Specialization(guards = "mask != null") + public boolean doJavaStringMask(String input, int fromIndex1, String match, int fromIndex2, int length, String mask) { + return ArrayUtils.regionEqualsWithOrMask(input, fromIndex1, match, fromIndex2, length, mask); + } + @Specialization(guards = "mask == null") - public boolean regionMatches(String input, int fromIndex1, StringUTF16 match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask) { - return input.regionMatches(fromIndex1, match.toString(), fromIndex2, length); + public boolean doTruffleObjBytes(TruffleObject input, int fromIndex1, byte[] match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + return regionMatchesTruffleObj(input, fromIndex1, match, fromIndex2, length, null, lengthNode, charAtNode); } @Specialization(guards = "mask != null") - public boolean regionMatchesWithMask(String input, int fromIndex1, StringUTF16 match, int fromIndex2, int length, StringUTF16 mask) { - return ArrayUtils.regionEqualsWithOrMask(input, fromIndex1, match.toString(), fromIndex2, length, mask.toString()); + public boolean doTruffleObjBytesMask(TruffleObject input, int fromIndex1, byte[] match, int fromIndex2, int length, byte[] mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert match.length == mask.length; + return regionMatchesTruffleObj(input, fromIndex1, match, fromIndex2, length, mask, lengthNode, charAtNode); } @Specialization(guards = "mask == null") - public boolean regionMatchesTruffleObjNoMask(TruffleObject input, int fromIndex1, StringUTF16 match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { + public boolean doTruffleObjString(TruffleObject input, int fromIndex1, String match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask, 
+ @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { return regionMatchesTruffleObj(input, fromIndex1, match, fromIndex2, length, null, lengthNode, charAtNode); } @Specialization(guards = "mask != null") - public boolean regionMatchesTruffleObjWithMask(TruffleObject input, int fromIndex1, StringUTF16 match, int fromIndex2, int length, StringUTF16 mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { - assert match.encodedLength() == mask.encodedLength(); + public boolean doTruffleObjStringMask(TruffleObject input, int fromIndex1, String match, int fromIndex2, int length, String mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert match.length() == mask.length(); return regionMatchesTruffleObj(input, fromIndex1, match, fromIndex2, length, mask, lengthNode, charAtNode); } @Specialization(guards = "mask == null") - public boolean regionMatchesTruffleObjTruffleObjNoMask(TruffleObject input, int fromIndex1, TruffleObject match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask, - @Cached("create()") InputLengthNode lengthNode1, - @Cached("create()") InputReadNode charAtNode1, - @Cached("create()") InputLengthNode lengthNode2, - @Cached("create()") InputReadNode charAtNode2) { + public boolean doTruffleObjTruffleObj(TruffleObject input, int fromIndex1, TruffleObject match, int fromIndex2, int length, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode1, + @Cached InputReadNode charAtNode1, + @Cached InputLengthNode lengthNode2, + @Cached InputReadNode charAtNode2) { if (fromIndex1 + length > lengthNode1.execute(input) || fromIndex2 + length > lengthNode2.execute(match)) { return false; } @@ -102,10 +121,24 @@ public boolean regionMatchesTruffleObjTruffleObjNoMask(TruffleObject input, int return true; } - private static boolean regionMatchesTruffleObj(TruffleObject input, int fromIndex1, StringUTF16 match, int 
fromIndex2, int length, StringUTF16 mask, + private static boolean regionMatchesTruffleObj(TruffleObject input, int fromIndex1, byte[] match, int fromIndex2, int length, byte[] mask, + InputLengthNode lengthNode, + InputReadNode charAtNode) { + if (fromIndex1 + length > lengthNode.execute(input) || fromIndex2 + length > match.length) { + return false; + } + for (int i = 0; i < length; i++) { + if (InputReadNode.readWithMask(input, fromIndex1 + i, mask, i, charAtNode) != Byte.toUnsignedInt(match[fromIndex2 + i])) { + return false; + } + } + return true; + } + + private static boolean regionMatchesTruffleObj(TruffleObject input, int fromIndex1, String match, int fromIndex2, int length, String mask, InputLengthNode lengthNode, InputReadNode charAtNode) { - if (fromIndex1 + length > lengthNode.execute(input) || fromIndex2 + length > match.encodedLength()) { + if (fromIndex1 + length > lengthNode.execute(input) || fromIndex2 + length > match.length()) { return false; } for (int i = 0; i < length; i++) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputStartsWithNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputStartsWithNode.java index bb39417fec72..d4610dd7a71d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputStartsWithNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputStartsWithNode.java @@ -45,7 +45,6 @@ import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.interop.TruffleObject; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.regex.tregex.string.StringUTF16; public abstract class InputStartsWithNode extends Node { @@ -56,35 +55,72 @@ public static InputStartsWithNode create() { public abstract boolean execute(Object input, Object prefix, Object mask); @Specialization(guards = "mask == null") - public boolean 
startsWith(String input, StringUTF16 prefix, @SuppressWarnings("unused") Object mask) { - return input.startsWith(prefix.toString()); + public boolean doBytes(byte[] input, byte[] prefix, @SuppressWarnings("unused") Object mask) { + return ArrayUtils.regionEqualsWithOrMask(input, 0, prefix, 0, prefix.length, null); } @Specialization(guards = "mask != null") - public boolean startsWithWithMask(String input, StringUTF16 prefix, StringUTF16 mask) { - return ArrayUtils.regionEqualsWithOrMask(input, 0, prefix.toString(), 0, mask.encodedLength(), mask.toString()); + public boolean doBytesMask(byte[] input, byte[] prefix, byte[] mask) { + return ArrayUtils.regionEqualsWithOrMask(input, 0, prefix, 0, prefix.length, mask); } @Specialization(guards = "mask == null") - public boolean startsWithTruffleObjNoMask(TruffleObject input, StringUTF16 prefix, @SuppressWarnings("unused") Object mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { + public boolean doString(String input, String prefix, @SuppressWarnings("unused") Object mask) { + return input.startsWith(prefix); + } + + @Specialization(guards = "mask != null") + public boolean doStringMask(String input, String prefix, String mask) { + return ArrayUtils.regionEqualsWithOrMask(input, 0, prefix, 0, mask.length(), mask); + } + + @Specialization(guards = "mask == null") + public boolean doTruffleObjBytes(TruffleObject input, byte[] prefix, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + return startsWithTruffleObj(input, prefix, null, lengthNode, charAtNode); + } + + @Specialization(guards = "mask != null") + public boolean doTruffleObjBytesMask(TruffleObject input, byte[] prefix, byte[] mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert mask.length == prefix.length; + return startsWithTruffleObj(input, prefix, mask, lengthNode, charAtNode); + } + + 
@Specialization(guards = "mask == null") + public boolean doTruffleObjString(TruffleObject input, String prefix, @SuppressWarnings("unused") Object mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { return startsWithTruffleObj(input, prefix, null, lengthNode, charAtNode); } @Specialization(guards = "mask != null") - public boolean startsWithTruffleObjWithMask(TruffleObject input, StringUTF16 prefix, StringUTF16 mask, - @Cached("create()") InputLengthNode lengthNode, - @Cached("create()") InputReadNode charAtNode) { - assert mask.encodedLength() == prefix.encodedLength(); + public boolean doTruffleObjStringMask(TruffleObject input, String prefix, String mask, + @Cached InputLengthNode lengthNode, + @Cached InputReadNode charAtNode) { + assert mask.length() == prefix.length(); return startsWithTruffleObj(input, prefix, mask, lengthNode, charAtNode); } - private static boolean startsWithTruffleObj(TruffleObject input, StringUTF16 prefix, StringUTF16 mask, InputLengthNode lengthNode, InputReadNode charAtNode) { - if (lengthNode.execute(input) < prefix.encodedLength()) { + private static boolean startsWithTruffleObj(TruffleObject input, byte[] prefix, byte[] mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + if (lengthNode.execute(input) < prefix.length) { + return false; + } + for (int i = 0; i < prefix.length; i++) { + if (InputReadNode.readWithMask(input, i, mask, i, charAtNode) != Byte.toUnsignedInt(prefix[i])) { + return false; + } + } + return true; + } + + private static boolean startsWithTruffleObj(TruffleObject input, String prefix, String mask, InputLengthNode lengthNode, InputReadNode charAtNode) { + if (lengthNode.execute(input) < prefix.length()) { return false; } - for (int i = 0; i < prefix.encodedLength(); i++) { + for (int i = 0; i < prefix.length(); i++) { if (InputReadNode.readWithMask(input, i, mask, i, charAtNode) != prefix.charAt(i)) { return false; } diff --git 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java index cad52ebcda15..7315b69eba77 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java @@ -111,7 +111,7 @@ public TRegexBacktrackingNFAExecutorNode(PureNFAMap nfaMap, PureNFA nfa, TRegexE } if (loopbackInitialState && innerLiteral == null) { CodePointSet initialCharSet = nfaMap.getMergedInitialStateCharSet(compilationBuffer); - loopbackInitialStateMatcher = initialCharSet == null ? null : CharMatchers.createMatcher(initialCharSet, nfaMap.getAst().getEncoding(), compilationBuffer); + loopbackInitialStateMatcher = initialCharSet == null ? null : CharMatchers.createMatcher(initialCharSet, compilationBuffer); } nfa.materializeGroupBoundaries(); matchers = new CharMatcher[nfa.getNumberOfStates()]; @@ -119,7 +119,7 @@ public TRegexBacktrackingNFAExecutorNode(PureNFAMap nfaMap, PureNFA nfa, TRegexE for (int i = 0; i < matchers.length; i++) { PureNFAState s = nfa.getState(i); if (s.isCharacterClass()) { - matchers[i] = insert(CharMatchers.createMatcher(s.getCharSet(), nfaMap.getAst().getEncoding(), compilationBuffer)); + matchers[i] = insert(CharMatchers.createMatcher(s.getCharSet(), compilationBuffer)); } maxTransitions = Math.max(maxTransitions, s.getSuccessors(forward).length); s.initIsDeterministic(forward, compilationBuffer); @@ -691,7 +691,7 @@ private int findInnerLiteral(TRegexBacktrackingNFAExecutorLocals locals) { CompilerDirectives.transferToInterpreterAndInvalidate(); indexOfNode = InputIndexOfStringNode.create(); } - return indexOfNode.execute(locals.getInput(), locals.getIndex(), locals.getMaxIndex(), 
innerLiteral.getLiteral(), innerLiteral.getMask()); + return indexOfNode.execute(locals.getInput(), locals.getIndex(), locals.getMaxIndex(), innerLiteral.getLiteral().content(), innerLiteral.getMaskContent()); } private boolean inputBoundsCheck(int i, int min, int max) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexLiteralLookAroundExecutorNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexLiteralLookAroundExecutorNode.java index 6bf6671690fe..e2da2215c1fc 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexLiteralLookAroundExecutorNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexLiteralLookAroundExecutorNode.java @@ -49,7 +49,6 @@ import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorLocals; import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorNode; import com.oracle.truffle.regex.tregex.parser.ast.LookAroundAssertion; -import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; /** * Specialized {@link TRegexExecutorNode} for matching {@link LookAroundAssertion#isLiteral() @@ -61,13 +60,13 @@ public final class TRegexLiteralLookAroundExecutorNode extends TRegexExecutorNod private final boolean negated; @Children private CharMatcher[] matchers; - public TRegexLiteralLookAroundExecutorNode(LookAroundAssertion lookAround, Encoding encoding, CompilationBuffer compilationBuffer) { + public TRegexLiteralLookAroundExecutorNode(LookAroundAssertion lookAround, CompilationBuffer compilationBuffer) { assert lookAround.isLiteral(); forward = lookAround.isLookAheadAssertion(); negated = lookAround.isNegated(); matchers = new CharMatcher[lookAround.getLiteralLength()]; for (int i = 0; i < matchers.length; i++) { - CharMatcher matcher = CharMatchers.createMatcher(lookAround.getGroup().getFirstAlternative().get(i).asCharacterClass().getCharSet(), encoding, 
compilationBuffer); + CharMatcher matcher = CharMatchers.createMatcher(lookAround.getGroup().getFirstAlternative().get(i).asCharacterClass().getCharSet(), compilationBuffer); matchers[forward ? i : matchers.length - (i + 1)] = insert(matcher); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/CaseFoldTable.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/CaseFoldTable.java index fac421cf172d..ffc28b4d04b7 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/CaseFoldTable.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/CaseFoldTable.java @@ -198,16 +198,6 @@ public int size() { return ranges.length / 4; } - @Override - public int getMinValue() { - return 0; - } - - @Override - public int getMaxValue() { - return 0x10ffff; - } - @Override public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) { throw new UnsupportedOperationException(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java index d07c7937a6e0..ced637d69511 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java @@ -53,6 +53,7 @@ import com.oracle.truffle.regex.charset.CodePointSetAccumulator; import com.oracle.truffle.regex.charset.Constants; import com.oracle.truffle.regex.charset.UnicodeProperties; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; import com.oracle.truffle.regex.util.CompilationFinalBitSet; public final class RegexLexer { @@ -67,6 +68,7 @@ public final class RegexLexer { private final RegexSource source; private final String pattern; private final RegexFlags flags; + private final Encoding 
encoding; private final RegexOptions options; private Token lastToken; private int index = 0; @@ -76,10 +78,11 @@ public final class RegexLexer { private final CodePointSetAccumulator curCharClass = new CodePointSetAccumulator(); private final CodePointSetAccumulator charClassCaseFoldTmp = new CodePointSetAccumulator(); - public RegexLexer(RegexSource source, RegexFlags flags, RegexOptions options) { + public RegexLexer(RegexSource source, RegexFlags flags, Encoding encoding, RegexOptions options) { this.source = source; this.pattern = source.getPattern(); this.flags = flags; + this.encoding = encoding; this.options = options; } @@ -257,7 +260,12 @@ private Token charClass(boolean invert) { CaseFoldTable.CaseFoldingAlgorithm caseFolding = flags.isUnicode() ? CaseFoldTable.CaseFoldingAlgorithm.ECMAScriptUnicode : CaseFoldTable.CaseFoldingAlgorithm.ECMAScriptNonUnicode; CaseFoldTable.applyCaseFold(curCharClass, charClassCaseFoldTmp, caseFolding); } - return Token.createCharClass(invert ? CodePointSet.createInverse(curCharClass.get()) : curCharClass.toCodePointSet(), wasSingleChar); + CodePointSet cps = pruneCharClass(curCharClass.toCodePointSet()); + return Token.createCharClass(invert ? cps.createInverse(encoding) : cps, wasSingleChar); + } + + private CodePointSet pruneCharClass(CodePointSet cps) { + return encoding.getFullSet().createIntersection(cps, curCharClass.getTmp()); } /* lexer */ @@ -266,7 +274,7 @@ private Token getNext() throws RegexSyntaxException { final char c = consumeChar(); switch (c) { case '.': - return Token.createCharClass(flags.isDotAll() ? Constants.DOT_ALL : Constants.DOT); + return Token.createCharClass(pruneCharClass(flags.isDotAll() ? Constants.DOT_ALL : Constants.DOT)); case '^': return Token.createCaret(); case '$': @@ -357,7 +365,7 @@ private Token parseEscape() throws RegexSyntaxException { // the case-folding step in the `charClass` method and call `Token::createCharClass` // directly. 
if (isPredefCharClass(c)) { - return Token.createCharClass(parsePredefCharClass(c)); + return Token.createCharClass(pruneCharClass(parsePredefCharClass(c))); } else if (flags.isUnicode() && (c == 'p' || c == 'P')) { return charClass(parseUnicodeCharacterProperty(c == 'P')); } else { @@ -624,8 +632,8 @@ private CodePointSet parseUnicodeCharacterProperty(boolean invert) throws RegexS throw syntaxError(ErrorMessages.ENDS_WITH_UNFINISHED_UNICODE_PROPERTY); } try { - CodePointSet propertySet = UnicodeProperties.getProperty(pattern.substring(namePos, index - 1)); - return invert ? propertySet.createInverse() : propertySet; + CodePointSet propertySet = encoding.getFullSet().createIntersection(UnicodeProperties.getProperty(pattern.substring(namePos, index - 1)), curCharClass.getTmp()); + return invert ? propertySet.createInverse(encoding) : propertySet; } catch (IllegalArgumentException e) { throw syntaxError(e.getMessage()); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexParser.java index eea4ac837568..37893daf5d46 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexParser.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexParser.java @@ -79,6 +79,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.visitors.NodeCountVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.SetSourceSectionVisitor; import com.oracle.truffle.regex.tregex.string.Encodings; +import com.oracle.truffle.regex.tregex.string.Encodings.Encoding; public final class RegexParser { @@ -136,12 +137,12 @@ public final class RegexParser { private final CompilationBuffer compilationBuffer; @TruffleBoundary - public RegexParser(RegexSource source, RegexOptions options, CompilationBuffer compilationBuffer) throws RegexSyntaxException { + public 
RegexParser(RegexSource source, RegexFlags flags, Encoding encoding, RegexOptions options, CompilationBuffer compilationBuffer) throws RegexSyntaxException { this.source = source; - this.flags = RegexFlags.parseFlags(source.getFlags()); + this.flags = flags; this.options = options; - this.lexer = new RegexLexer(source, flags, options); - this.ast = new RegexAST(source, flags, options, flags.isUnicode() && !options.isUTF16ExplodeAstralSymbols() ? Encodings.UTF_16 : Encodings.UTF_16_RAW); + this.lexer = new RegexLexer(source, flags, encoding, options); + this.ast = new RegexAST(source, flags, options, encoding); this.properties = ast.getProperties(); this.groupCount = ast.getGroupCount(); this.copyVisitor = new CopyVisitor(ast); @@ -151,7 +152,7 @@ public RegexParser(RegexSource source, RegexOptions options, CompilationBuffer c } private static Group parseRootLess(String pattern) throws RegexSyntaxException { - return new RegexParser(new RegexSource(pattern), RegexOptions.DEFAULT, new CompilationBuffer()).parse(false); + return new RegexParser(new RegexSource(pattern), RegexFlags.DEFAULT, Encodings.UTF_16_RAW, RegexOptions.DEFAULT, new CompilationBuffer(Encodings.UTF_16_RAW)).parse(false); } @TruffleBoundary @@ -347,7 +348,7 @@ private void optimizeLookAround() { // surrogates assert !flags.isUnicode() || !options.isUTF16ExplodeAstralSymbols() || cc.getCharSet().matchesNothing() || cc.getCharSet().getMax() <= 0xffff; assert !group.hasEnclosedCaptureGroups(); - cc.setCharSet(cc.getCharSet().createInverse()); + cc.setCharSet(cc.getCharSet().createInverse(ast.getEncoding())); ast.updatePropsCC(cc); curSequence.removeLastTerm(); Group wrapGroup = ast.createGroup(); @@ -402,12 +403,12 @@ private Term translateUnicodeCharClass(Token.CharacterClass token) { if (loneLeadSurrogateRanges.matchesSomething()) { Sequence loneLeadSurrogateAlternative = group.addSequence(ast); loneLeadSurrogateAlternative.add(createCharClass(loneLeadSurrogateRanges, token)); - 
loneLeadSurrogateAlternative.add(NO_TRAIL_SURROGATE_AHEAD.copy(ast, true)); + loneLeadSurrogateAlternative.add(NO_TRAIL_SURROGATE_AHEAD.copyRecursive(ast, compilationBuffer)); } if (loneTrailSurrogateRanges.matchesSomething()) { Sequence loneTrailSurrogateAlternative = group.addSequence(ast); - loneTrailSurrogateAlternative.add(NO_LEAD_SURROGATE_BEHIND.copy(ast, true)); + loneTrailSurrogateAlternative.add(NO_LEAD_SURROGATE_BEHIND.copyRecursive(ast, compilationBuffer)); loneTrailSurrogateAlternative.add(createCharClass(loneTrailSurrogateRanges, token)); } @@ -654,7 +655,7 @@ protected void visit(Group group) { } private void substitute(Token token, Group substitution) { - Group copy = substitution.copy(ast, true); + Group copy = substitution.copyRecursive(ast, compilationBuffer); if (options.isDumpAutomata()) { setSourceSectionVisitor.run(copy, token); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexValidator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexValidator.java index 84ef68621c24..7b5d01d096dd 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexValidator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexValidator.java @@ -51,6 +51,7 @@ import com.oracle.truffle.regex.RegexSyntaxException; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.tregex.TRegexOptions; +import com.oracle.truffle.regex.tregex.string.Encodings; public class RegexValidator { @@ -62,7 +63,7 @@ public class RegexValidator { public RegexValidator(RegexSource source, RegexFlags flags, RegexOptions options) { this.source = source; this.flags = flags; - this.lexer = new RegexLexer(source, flags, options); + this.lexer = new RegexLexer(source, flags, flags.isUnicode() ? 
Encodings.UTF_16 : Encodings.UTF_16_RAW, options); } @TruffleBoundary diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java index 5943a68cc15c..589d806b5c1d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java @@ -40,6 +40,7 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonValue; @@ -66,10 +67,15 @@ private BackReference(BackReference copy) { } @Override - public BackReference copy(RegexAST ast, boolean recursive) { + public BackReference copy(RegexAST ast) { return ast.register(new BackReference(this)); } + @Override + public BackReference copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + return copy(ast); + } + /** * Returns the capture group number this back-reference is referring to, e.g. the referenced * group of {@code \1} is 1. 
diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CharacterClass.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CharacterClass.java index eae254a21bdd..9c2d3f70d031 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CharacterClass.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CharacterClass.java @@ -48,6 +48,7 @@ import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.automaton.StateSet; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.RegexParser; import com.oracle.truffle.regex.tregex.string.AbstractStringBuffer; import com.oracle.truffle.regex.tregex.util.json.Json; @@ -82,14 +83,19 @@ public class CharacterClass extends QuantifiableTerm { this.charSet = charSet; } - private CharacterClass(CharacterClass copy) { + private CharacterClass(CharacterClass copy, CodePointSet charSet) { super(copy); - charSet = copy.charSet; + this.charSet = charSet; + } + + @Override + public CharacterClass copy(RegexAST ast) { + return ast.register(new CharacterClass(this, charSet)); } @Override - public CharacterClass copy(RegexAST ast, boolean recursive) { - return ast.register(new CharacterClass(this)); + public CharacterClass copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + return ast.register(new CharacterClass(this, ast.getEncoding().getFullSet().createIntersection(charSet, compilationBuffer))); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java index fc28e2b24ef6..54167aab5096 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java @@ -46,6 +46,7 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.tregex.TRegexOptions; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.RegexASTVisitorIterable; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonValue; @@ -93,21 +94,25 @@ public final class Group extends QuantifiableTerm implements RegexASTVisitorIter setGroupNumber(groupNumber); } - private Group(Group copy, RegexAST ast, boolean recursive) { + private Group(Group copy) { super(copy); groupNumber = copy.groupNumber; enclosedCaptureGroupsLow = copy.enclosedCaptureGroupsLow; enclosedCaptureGroupsHigh = copy.enclosedCaptureGroupsHigh; - if (recursive) { - for (Sequence s : copy.alternatives) { - add(s.copy(ast, true)); - } - } } @Override - public Group copy(RegexAST ast, boolean recursive) { - return ast.register(new Group(this, ast, recursive)); + public Group copy(RegexAST ast) { + return ast.register(new Group(this)); + } + + @Override + public Group copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + Group copy = copy(ast); + for (Sequence s : alternatives) { + copy.add(s.copyRecursive(ast, compilationBuffer)); + } + return copy; } /** diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/InnerLiteral.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/InnerLiteral.java index 8a202d09d307..89b83e9b1463 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/InnerLiteral.java +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/InnerLiteral.java @@ -72,6 +72,10 @@ public AbstractString getMask() { return mask; } + public Object getMaskContent() { + return mask == null ? null : mask.content(); + } + /** * The maximum number of code points the regular expression may match before matching this * literal. Example: the inner literal of {@code /a?b/} is {@code "b"}, with a max prefix size diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAheadAssertion.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAheadAssertion.java index 21f003b16161..c13bc1f6f9fd 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAheadAssertion.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAheadAssertion.java @@ -40,6 +40,7 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.util.json.JsonValue; import static com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -66,13 +67,22 @@ public class LookAheadAssertion extends LookAroundAssertion { super(negated); } - private LookAheadAssertion(LookAheadAssertion copy, RegexAST ast, boolean recursive) { - super(copy, ast, recursive); + private LookAheadAssertion(LookAheadAssertion copy, RegexAST ast) { + super(copy, ast); + } + + private LookAheadAssertion(LookAheadAssertion copy, RegexAST ast, CompilationBuffer compilationBuffer) { + super(copy, ast, compilationBuffer); + } + + @Override + public LookAheadAssertion copy(RegexAST ast) { + return ast.register(new LookAheadAssertion(this, ast)); } @Override - public LookAheadAssertion copy(RegexAST ast, boolean recursive) { - return ast.register(new LookAheadAssertion(this, ast, recursive)); + public LookAheadAssertion copyRecursive(RegexAST 
ast, CompilationBuffer compilationBuffer) { + return ast.register(new LookAheadAssertion(this, ast, compilationBuffer)); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAroundAssertion.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAroundAssertion.java index 5522346f07a5..cd3a3e336931 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAroundAssertion.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookAroundAssertion.java @@ -40,6 +40,8 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; + /** * An assertion that succeeds depending on whether or not text surrounding the current position * matches a given regular expression. @@ -52,8 +54,12 @@ public abstract class LookAroundAssertion extends RegexASTSubtreeRootNode { setNegated(negated); } - LookAroundAssertion(LookAroundAssertion copy, RegexAST ast, boolean recursive) { - super(copy, ast, recursive); + LookAroundAssertion(LookAroundAssertion copy, RegexAST ast) { + super(copy, ast); + } + + LookAroundAssertion(LookAroundAssertion copy, RegexAST ast, CompilationBuffer compilationBuffer) { + super(copy, ast, compilationBuffer); } /** diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookBehindAssertion.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookBehindAssertion.java index 4ec5bc72a5dd..513b18d6c571 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookBehindAssertion.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/LookBehindAssertion.java @@ -40,6 +40,7 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import 
com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.util.json.JsonValue; import static com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -72,13 +73,22 @@ public class LookBehindAssertion extends LookAroundAssertion { super(negated); } - private LookBehindAssertion(LookBehindAssertion copy, RegexAST ast, boolean recursive) { - super(copy, ast, recursive); + private LookBehindAssertion(LookBehindAssertion copy, RegexAST ast) { + super(copy, ast); + } + + private LookBehindAssertion(LookBehindAssertion copy, RegexAST ast, CompilationBuffer compilationBuffer) { + super(copy, ast, compilationBuffer); + } + + @Override + public LookBehindAssertion copy(RegexAST ast) { + return ast.register(new LookBehindAssertion(this, ast)); } @Override - public LookBehindAssertion copy(RegexAST ast, boolean recursive) { - return ast.register(new LookBehindAssertion(this, ast, recursive)); + public LookBehindAssertion copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + return ast.register(new LookBehindAssertion(this, ast, compilationBuffer)); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/MatchFound.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/MatchFound.java index 2bbdf3b290bd..60d5b3cb6ef1 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/MatchFound.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/MatchFound.java @@ -41,6 +41,7 @@ package com.oracle.truffle.regex.tregex.parser.ast; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.util.json.JsonValue; /** @@ -68,7 +69,12 @@ public class MatchFound extends Term { private RegexASTNode next; @Override - public MatchFound copy(RegexAST ast, boolean 
recursive) { + public MatchFound copy(RegexAST ast) { + throw new UnsupportedOperationException(); + } + + @Override + public MatchFound copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { throw new UnsupportedOperationException(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/PositionAssertion.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/PositionAssertion.java index b44e32c3b2ac..37c18c298e72 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/PositionAssertion.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/PositionAssertion.java @@ -40,6 +40,7 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.util.json.JsonValue; import static com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -93,10 +94,15 @@ private PositionAssertion(PositionAssertion copy) { } @Override - public PositionAssertion copy(RegexAST ast, boolean recursive) { + public PositionAssertion copy(RegexAST ast) { return ast.register(new PositionAssertion(this)); } + @Override + public Term copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + return copy(ast); + } + public RegexASTNode getNext() { return next; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java index a98b757a5c7e..4bb3be17723f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java @@ -65,7 +65,7 @@ public abstract class QuantifiableTerm extends Term { } @Override - 
public abstract QuantifiableTerm copy(RegexAST ast, boolean recursive); + public abstract QuantifiableTerm copy(RegexAST ast); public boolean hasQuantifier() { return quantifier != null; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java index 401802bb6dca..646e99917e15 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java @@ -272,6 +272,7 @@ public BackReference createBackReference(int groupNumber) { } public CharacterClass createCharacterClass(CodePointSet matcherBuilder) { + assert encoding.getFullSet().contains(matcherBuilder); return register(new CharacterClass(matcherBuilder)); } @@ -388,12 +389,6 @@ public void invertNegativeLookAround(LookAroundAssertion assertion) { public PositionAssertion register(PositionAssertion positionAssertion) { nodeCount.inc(); - switch (positionAssertion.type) { - case CARET: - break; - case DOLLAR: - break; - } return positionAssertion; } @@ -544,7 +539,7 @@ public GroupBoundaries createGroupBoundaries(CompilationFinalBitSet updateIndice * set to true. 
*/ private CharacterClass createPrefixAnyMatcher() { - final CharacterClass anyMatcher = createCharacterClass(CodePointSet.getFull()); + final CharacterClass anyMatcher = createCharacterClass(encoding.getFullSet()); anyMatcher.setPrefix(); return anyMatcher; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java index e8baa02e7fc8..d528edad1090 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java @@ -41,6 +41,7 @@ package com.oracle.truffle.regex.tregex.parser.ast; import com.oracle.truffle.regex.tregex.TRegexOptions; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.CopyVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.MarkLookBehindEntriesVisitor; import com.oracle.truffle.regex.tregex.util.json.Json; @@ -86,19 +87,23 @@ protected RegexASTNode(RegexASTNode copy) { } /** - * Copy this node, in one of the following ways: - *
<ul> - * <li>if {@code recursive} is {@code true}, recursively copy this subtree. This method should - * be used instead of {@link CopyVisitor} if the copying process is required to be thread-safe. - * </li> - * <li>else, copy this node only, without any child nodes.</li> - * </ul>
    - * In both cases, the ID and minPath of the copied nodes is left unset. + * Copy this node only, without any child nodes. The ID and minPath of the copied nodes is left + * unset. + * + * @param ast RegexAST the node should belong to. + * @return A shallow copy of this node. + */ + public abstract RegexASTNode copy(RegexAST ast); + + /** + * Recursively copy this subtree. This method should be used instead of {@link CopyVisitor} if + * the copying process is required to be thread-safe. The ID and minPath of the copied nodes is + * left unset. * * @param ast RegexAST the new nodes should belong to. - * @return A shallow or deep copy of this node. + * @return A deep copy of this node. */ - public abstract RegexASTNode copy(RegexAST ast, boolean recursive); + public abstract RegexASTNode copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer); public abstract boolean equalsSemantic(RegexASTNode obj); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTRootNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTRootNode.java index 658948afef74..6864aeeff038 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTRootNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTRootNode.java @@ -40,6 +40,7 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.InitIDVisitor; import com.oracle.truffle.regex.tregex.util.json.JsonValue; @@ -54,13 +55,22 @@ public class RegexASTRootNode extends RegexASTSubtreeRootNode { setId(InitIDVisitor.REGEX_AST_ROOT_PARENT_ID); } - private RegexASTRootNode(RegexASTRootNode copy, RegexAST ast, boolean recursive) { - super(copy, ast, recursive); + private RegexASTRootNode(RegexASTRootNode copy, RegexAST ast) { + 
super(copy, ast); + } + + private RegexASTRootNode(RegexASTRootNode copy, RegexAST ast, CompilationBuffer compilationBuffer) { + super(copy, ast, compilationBuffer); + } + + @Override + public RegexASTSubtreeRootNode copy(RegexAST ast) { + return new RegexASTRootNode(this, ast); } @Override - public RegexASTSubtreeRootNode copy(RegexAST ast, boolean recursive) { - return new RegexASTRootNode(this, ast, recursive); + public RegexASTSubtreeRootNode copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { + return new RegexASTRootNode(this, ast, compilationBuffer); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java index 487b98587d6f..9fc018da52f2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java @@ -41,6 +41,7 @@ package com.oracle.truffle.regex.tregex.parser.ast; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.RegexASTVisitorIterable; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonObject; @@ -64,14 +65,22 @@ public abstract class RegexASTSubtreeRootNode extends Term implements RegexASTVi RegexASTSubtreeRootNode() { } - RegexASTSubtreeRootNode(RegexASTSubtreeRootNode copy, RegexAST ast, boolean recursive) { + /** + * Shallow copy constructor. + */ + RegexASTSubtreeRootNode(RegexASTSubtreeRootNode copy, RegexAST ast) { super(copy); - if (recursive) { - setGroup(copy.group.copy(ast, true)); - } ast.createNFAHelperNodes(this); } + /** + * Recursive copy constructor. 
+ */ + RegexASTSubtreeRootNode(RegexASTSubtreeRootNode copy, RegexAST ast, CompilationBuffer compilationBuffer) { + this(copy, ast); + setGroup(copy.group.copyRecursive(ast, compilationBuffer)); + } + public boolean subTreeIdInitialized() { return subTreeId >= 0; } @@ -85,7 +94,7 @@ public void setSubTreeId(int subTreeId) { } @Override - public abstract RegexASTSubtreeRootNode copy(RegexAST ast, boolean recursive); + public abstract RegexASTSubtreeRootNode copy(RegexAST ast); /** * Returns the {@link Group} that represents the contents of this subtree. diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java index 4d9dece5a9f6..06d37c7c55df 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java @@ -44,6 +44,7 @@ import java.util.stream.Collectors; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.RegexASTVisitorIterable; import com.oracle.truffle.regex.tregex.util.json.Json; import com.oracle.truffle.regex.tregex.util.json.JsonValue; @@ -65,18 +66,22 @@ public final class Sequence extends RegexASTNode implements RegexASTVisitorItera Sequence() { } - private Sequence(Sequence copy, RegexAST ast, boolean recursive) { + private Sequence(Sequence copy) { super(copy); - if (recursive) { - for (Term t : copy.terms) { - add(t.copy(ast, true)); - } - } } @Override - public Sequence copy(RegexAST ast, boolean recursive) { - return ast.register(new Sequence(this, ast, recursive)); + public Sequence copy(RegexAST ast) { + return ast.register(new Sequence(this)); + } + + @Override + public Sequence copyRecursive(RegexAST ast, 
CompilationBuffer compilationBuffer) { + Sequence copy = copy(ast); + for (Term t : terms) { + copy.add(t.copyRecursive(ast, compilationBuffer)); + } + return copy; } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Term.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Term.java index 536d044fb6fc..bc7f05b12def 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Term.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Term.java @@ -43,6 +43,7 @@ import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.automaton.AbstractState; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.nfa.ASTTransition; /** @@ -64,7 +65,10 @@ public abstract class Term extends RegexASTNode implements AbstractState { AbstractStringIterator iterator(); int encodedLength(); + + Object content(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java index ccdea1a632cc..7b7b24f4c635 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java @@ -40,11 +40,22 @@ */ package com.oracle.truffle.regex.tregex.string; +import com.oracle.truffle.regex.charset.CharMatchers; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.Constants; +import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; +import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode.LoopOptIndexOfAnyByteNode; 
+import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode.LoopOptIndexOfAnyCharNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode.LoopOptIndexOfStringNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode.LoopOptimizationNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.Builder; public final class Encodings { + private static final CodePointSet FULL_UNICODE_SET = CodePointSet.createNoDedup(0, 0x10ffff); + public static final Encoding UTF_8 = new Encoding.UTF8(); public static final Encoding UTF_16 = new Encoding.UTF16(); public static final Encoding UTF_32 = new Encoding.UTF32(); @@ -55,12 +66,32 @@ public abstract static class Encoding { public abstract String getName(); + public int getMinValue() { + return 0; + } + + public abstract int getMaxValue(); + + public abstract CodePointSet getFullSet(); + public abstract int getEncodedSize(int codepoint); public abstract boolean isFixedCodePointWidth(CodePointSet set); public abstract AbstractStringBuffer createStringBuffer(int capacity); + public abstract DFAStateNode.LoopOptimizationNode extractLoopOptNode(CodePointSet loopCPS); + + public abstract int getNumberOfDecodingSteps(); + + public Matchers.Builder createMatchersBuilder() { + return new Matchers.Builder(getNumberOfDecodingSteps()); + } + + public abstract void createMatcher(Builder matchersBuilder, int i, CodePointSet cps, CompilationBuffer compilationBuffer); + + public abstract Matchers toMatchers(Builder matchersBuilder); + public static final class UTF32 extends Encoding { private UTF32() { @@ -71,6 +102,16 @@ public String getName() { return "UTF-32"; } + @Override + public int getMaxValue() { + return Character.MAX_CODE_POINT; + } + + @Override + public CodePointSet getFullSet() { + return FULL_UNICODE_SET; + } + @Override public int getEncodedSize(int codepoint) { return 1; @@ -85,6 +126,27 @@ public boolean 
isFixedCodePointWidth(CodePointSet set) { public StringBufferUTF32 createStringBuffer(int capacity) { return new StringBufferUTF32(capacity); } + + @Override + public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) { + // TODO: not implemented yet + return null; + } + + @Override + public int getNumberOfDecodingSteps() { + return 1; + } + + @Override + public void createMatcher(Builder matchersBuilder, int i, CodePointSet cps, CompilationBuffer compilationBuffer) { + matchersBuilder.getBuffer(0).set(i, CharMatchers.createMatcher(cps, compilationBuffer)); + } + + @Override + public Matchers toMatchers(Builder matchersBuilder) { + return new Matchers.SimpleMatchers(matchersBuilder.materialize(0), matchersBuilder.getNoMatchSuccessor()); + } } public static final class UTF16 extends Encoding { @@ -97,6 +159,16 @@ public String getName() { return "UTF-16"; } + @Override + public int getMaxValue() { + return Character.MAX_CODE_POINT; + } + + @Override + public CodePointSet getFullSet() { + return FULL_UNICODE_SET; + } + @Override public int getEncodedSize(int codepoint) { return codepoint < 0x10000 ? 1 : 2; @@ -125,6 +197,25 @@ public boolean isFixedCodePointWidth(CodePointSet set) { return !(min < 0x10000 && max > 0x10000); } + @Override + public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) { + if (cps.inverseGetMax(this) <= 0xffff) { + char[] indexOfChars = cps.inverseToCharArray(this); + for (char c : indexOfChars) { + if (Constants.SURROGATES.contains(c)) { + return null; + } + } + return new LoopOptIndexOfAnyCharNode(indexOfChars); + } else if (cps.inverseValueCount(this) == 1) { + StringBufferUTF16 sb = createStringBuffer(2); + sb.append(cps.inverseGetMin(this)); + return new LoopOptIndexOfStringNode(sb.materialize(), null); + } else { + return null; + } + } + public static boolean isHighSurrogate(int c, boolean forward) { return forward ? 
isHighSurrogate(c) : isLowSurrogate(c); } @@ -147,10 +238,32 @@ public static boolean isLowSurrogate(int c) { public StringBufferUTF16 createStringBuffer(int capacity) { return new StringBufferUTF16(capacity); } + + @Override + public int getNumberOfDecodingSteps() { + return 2; + } + + @Override + public Matchers.Builder createMatchersBuilder() { + return new Matchers.Builder(3); + } + + @Override + public void createMatcher(Builder matchersBuilder, int i, CodePointSet cps, CompilationBuffer compilationBuffer) { + matchersBuilder.createSplitMatcher(i, cps, compilationBuffer, Constants.BYTE_RANGE, Constants.BMP_RANGE_WITHOUT_LATIN1, Constants.ASTRAL_SYMBOLS); + } + + @Override + public Matchers toMatchers(Builder matchersBuilder) { + return new Matchers.UTF16Matchers(matchersBuilder.materialize(0), matchersBuilder.materialize(1), matchersBuilder.materialize(2), matchersBuilder.getNoMatchSuccessor()); + } } public static final class UTF16Raw extends Encoding { + private static final CodePointSet UTF16_RAW_FULL_SET = CodePointSet.createNoDedup(0, 0xffff); + private UTF16Raw() { } @@ -159,6 +272,16 @@ public String getName() { return "UTF-16-RAW"; } + @Override + public int getMaxValue() { + return Character.MAX_VALUE; + } + + @Override + public CodePointSet getFullSet() { + return UTF16_RAW_FULL_SET; + } + @Override public int getEncodedSize(int codepoint) { return codepoint < 0x10000 ? 
1 : 2; @@ -173,6 +296,32 @@ public boolean isFixedCodePointWidth(CodePointSet set) { public StringBufferUTF16 createStringBuffer(int capacity) { return new StringBufferUTF16(capacity); } + + @Override + public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) { + return new LoopOptIndexOfAnyCharNode(cps.inverseToCharArray(this)); + } + + @Override + public int getNumberOfDecodingSteps() { + return 1; + } + + @Override + public Matchers.Builder createMatchersBuilder() { + return new Matchers.Builder(2); + } + + @Override + public void createMatcher(Builder matchersBuilder, int i, CodePointSet cps, CompilationBuffer compilationBuffer) { + assert cps.getMax() <= getMaxValue(); + matchersBuilder.createSplitMatcher(i, cps, compilationBuffer, Constants.BYTE_RANGE, Constants.BMP_RANGE_WITHOUT_LATIN1); + } + + @Override + public Matchers toMatchers(Builder matchersBuilder) { + return new Matchers.UTF16RawMatchers(matchersBuilder.materialize(0), matchersBuilder.materialize(1), matchersBuilder.getNoMatchSuccessor()); + } } public static final class UTF8 extends Encoding { @@ -182,6 +331,16 @@ public String getName() { return "UTF-8"; } + @Override + public int getMaxValue() { + return Character.MAX_CODE_POINT; + } + + @Override + public CodePointSet getFullSet() { + return FULL_UNICODE_SET; + } + @Override public int getEncodedSize(int codepoint) { if (codepoint < 0x80) { @@ -221,6 +380,36 @@ public boolean isFixedCodePointWidth(CodePointSet set) { public StringBufferUTF8 createStringBuffer(int capacity) { return new StringBufferUTF8(capacity); } + + @Override + public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) { + if (cps.inverseGetMax(this) <= 0x7f) { + byte[] indexOfChars = cps.inverseToByteArray(this); + return new LoopOptIndexOfAnyByteNode(indexOfChars); + } else if (cps.inverseValueCount(this) == 1) { + StringBufferUTF8 sb = createStringBuffer(4); + sb.append(cps.inverseGetMin(this)); + return new LoopOptIndexOfStringNode(sb.materialize(), new 
StringUTF8(new byte[sb.length()])); + } else { + return null; + } + } + + @Override + public int getNumberOfDecodingSteps() { + return 4; + } + + @Override + public void createMatcher(Builder matchersBuilder, int i, CodePointSet cps, CompilationBuffer compilationBuffer) { + matchersBuilder.createSplitMatcher(i, cps, compilationBuffer, Constants.ASCII_RANGE, Constants.UTF8_TWO_BYTE_RANGE, Constants.UTF8_THREE_BYTE_RANGE, Constants.ASTRAL_SYMBOLS); + } + + @Override + public Matchers toMatchers(Builder matchersBuilder) { + return new Matchers.UTF8Matchers(matchersBuilder.materialize(0), matchersBuilder.materialize(1), matchersBuilder.materialize(2), matchersBuilder.materialize(3), + matchersBuilder.getNoMatchSuccessor()); + } } } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF16.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF16.java index a9230d64ff43..b769543b0622 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF16.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF16.java @@ -62,6 +62,11 @@ public String toString() { return str; } + @Override + public Object content() { + return str; + } + @Override public AbstractStringIterator iterator() { return new StringUTF16Iterator(str); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF32.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF32.java index b31a67b98915..061be2231bfb 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF32.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF32.java @@ -53,6 +53,11 @@ public int encodedLength() { return str.length; } + @Override + public Object content() { + return str; + } + @Override public 
AbstractStringIterator iterator() { return new StringUTF32Iterator(str); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF8.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF8.java index d31fc20208cb..f354d7f6552e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF8.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/StringUTF8.java @@ -40,9 +40,11 @@ */ package com.oracle.truffle.regex.tregex.string; +import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; + public final class StringUTF8 implements AbstractString { - private final byte[] str; + @CompilationFinal(dimensions = 1) private final byte[] str; public StringUTF8(byte[] str) { this.str = str; @@ -53,6 +55,11 @@ public int encodedLength() { return str.length; } + @Override + public Object content() { + return str; + } + @Override public AbstractStringIterator iterator() { return new StringUTF8Iterator(str); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DFAExport.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DFAExport.java index e4bb58f3e842..78f7248ec652 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DFAExport.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DFAExport.java @@ -52,6 +52,7 @@ import com.oracle.truffle.regex.tregex.matchers.SingleCharMatcher; import com.oracle.truffle.regex.tregex.matchers.SingleRangeMatcher; import com.oracle.truffle.regex.tregex.nodes.dfa.DFAStateNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.Matchers.SimpleMatchers; import java.io.BufferedWriter; import java.io.IOException; @@ -105,7 +106,7 @@ public static void exportDot(DFAGenerator dfaGenerator, TruffleFile path, boolea } } for (DFAStateTransitionBuilder t : 
state.getSuccessors()) { - DotExport.printConnection(writer, dotState(state, shortLabels), dotState(t.getTarget(), shortLabels), t.getMatcherBuilder().toString()); + DotExport.printConnection(writer, dotState(state, shortLabels), dotState(t.getTarget(), shortLabels), t.getCodePointSet().toString()); } } writer.write("}"); @@ -130,10 +131,10 @@ public static void exportUnitTest(DFAStateNode entry, DFAStateNode[] states) { } System.out.println(" });"); System.out.printf("states[%d].setMatchers(new ByteMatcher[] {\n ", state.getId()); - printMatcher(state.getMatchers()[0]); - for (int i = 1; i < state.getMatchers().length; i++) { + printMatcher(((SimpleMatchers) state.getMatchers()).getMatchers()[0]); + for (int i = 1; i < state.getMatchers().size(); i++) { System.out.print(",\n "); - printMatcher(state.getMatchers()[i]); + printMatcher(((SimpleMatchers) state.getMatchers()).getMatchers()[i]); } System.out.println("\n});"); if (state.isFinalState()) { @@ -153,7 +154,7 @@ private static void printMatcher(CharMatcher matcher) { System.out.printf("RangeByteMatcher.create(0x%02x, 0x%02x)", ((SingleRangeMatcher) matcher).getLo(), ((SingleRangeMatcher) matcher).getHi()); } if (matcher instanceof BitSetMatcher) { - long[] words = ((BitSetMatcher) matcher).getBitSet().toLongArray(); + long[] words = ((BitSetMatcher) matcher).getBitSet(); System.out.printf("MultiByteMatcher.create(new CompilationFinalBitSet(new long[] {\n 0x%016xL", words[0]); for (int i = 1; i < words.length; i++) { System.out.printf(", 0x%016xL", words[i]); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java index 5020ba88f9e7..92d143a89d0f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java @@ -40,7 +40,6 @@ */ package 
com.oracle.truffle.regex.tregex.util; -import com.oracle.truffle.api.TruffleLogger; import com.oracle.truffle.regex.charset.Constants; import com.oracle.truffle.regex.charset.SortedListOfRanges; import com.oracle.truffle.regex.util.CompilationFinalBitSet; @@ -53,15 +52,6 @@ public class DebugUtil { - public static final TruffleLogger LOG_SWITCH_TO_EAGER = TruffleLogger.getLogger("regex", "SwitchToEager"); - public static final TruffleLogger LOG_TOTAL_COMPILATION_TIME = TruffleLogger.getLogger("regex", "TotalCompilationTime"); - public static final TruffleLogger LOG_PHASES = TruffleLogger.getLogger("regex", "Phases"); - public static final TruffleLogger LOG_BAILOUT_MESSAGES = TruffleLogger.getLogger("regex", "BailoutMessages"); - public static final TruffleLogger LOG_AUTOMATON_SIZES = TruffleLogger.getLogger("regex", "AutomatonSizes"); - public static final TruffleLogger LOG_COMPILER_FALLBACK = TruffleLogger.getLogger("regex", "CompilerFallback"); - public static final TruffleLogger LOG_INTERNAL_ERRORS = TruffleLogger.getLogger("regex", "InternalErrors"); - public static final TruffleLogger LOG_TREGEX_COMPILATIONS = TruffleLogger.getLogger("regex", "TRegexCompilations"); - private static final CompilationFinalBitSet validSpecialCharsForFileNames = CompilationFinalBitSet.valueOf( '^', '$', '.', '*', '+', '-', '?', '(', ')', '[', ']', '{', '}', '|'); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLastIndexOfNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/Loggers.java similarity index 61% rename from regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLastIndexOfNode.java rename to regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/Loggers.java index c25b7c62a144..ee93ff7893b7 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/input/InputLastIndexOfNode.java +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/Loggers.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -38,39 +38,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -package com.oracle.truffle.regex.tregex.nodes.input; +package com.oracle.truffle.regex.tregex.util; -import com.oracle.truffle.api.dsl.Cached; -import com.oracle.truffle.api.dsl.Specialization; -import com.oracle.truffle.api.interop.TruffleObject; -import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.regex.tregex.util.Boundaries; +import com.oracle.truffle.api.TruffleLogger; -public abstract class InputLastIndexOfNode extends Node { +public final class Loggers { - public static InputLastIndexOfNode create() { - return InputLastIndexOfNodeGen.create(); - } - - public abstract int execute(Object input, char c, int fromIndex, int maxIndex); - - @Specialization - public int lastIndexOf(String input, char c, int fromIndex, int maxIndex) { - int index = Boundaries.stringLastIndexOf(input, c, fromIndex); - if (index < maxIndex) { - return -1; - } - return index; - } - - @Specialization - public int lastIndexOf(TruffleObject input, char c, int fromIndex, int maxIndex, - @Cached("create()") InputReadNode charAtNode) { - for (int i = fromIndex; i >= maxIndex; i--) { - if (charAtNode.execute(input, i) == c) { - return i; - } - } - return -1; - } + public static final TruffleLogger LOG_SWITCH_TO_EAGER = TruffleLogger.getLogger("regex", "SwitchToEager"); + public static final TruffleLogger LOG_TOTAL_COMPILATION_TIME = TruffleLogger.getLogger("regex", "TotalCompilationTime"); + public static final TruffleLogger LOG_PHASES = TruffleLogger.getLogger("regex", 
"Phases"); + public static final TruffleLogger LOG_BAILOUT_MESSAGES = TruffleLogger.getLogger("regex", "BailoutMessages"); + public static final TruffleLogger LOG_AUTOMATON_SIZES = TruffleLogger.getLogger("regex", "AutomatonSizes"); + public static final TruffleLogger LOG_COMPILER_FALLBACK = TruffleLogger.getLogger("regex", "CompilerFallback"); + public static final TruffleLogger LOG_INTERNAL_ERRORS = TruffleLogger.getLogger("regex", "InternalErrors"); + public static final TruffleLogger LOG_TREGEX_COMPILATIONS = TruffleLogger.getLogger("regex", "TRegexCompilations"); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java index 9bbb90f20ecc..896f8de42f54 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java @@ -47,6 +47,14 @@ public final class BitSets { + public static int highByte(int c) { + return c >> Byte.SIZE; + } + + public static int lowByte(int c) { + return c & 0xff; + } + public static long[] createBitSetArray(int nbits) { return new long[wordIndex(nbits - 1) + 1]; } @@ -324,6 +332,25 @@ public void remove() { } } + @TruffleBoundary + public static String toString(long[] bs) { + StringBuilder sb = new StringBuilder("[ "); + int last = -2; + int rangeBegin = -2; + PrimitiveIterator.OfInt it = new BitSetIterator(bs); + while (it.hasNext()) { + int b = it.nextInt(); + if (b != last + 1) { + appendRange(sb, rangeBegin, last); + rangeBegin = b; + } + last = b; + } + appendRange(sb, rangeBegin, last); + sb.append(']'); + return sb.toString(); + } + @TruffleBoundary public static String toString(Iterable bs) { StringBuilder sb = new StringBuilder("[ ");