Skip to content

Commit

Permalink
TRegex: preparations for UTF-8 support
Browse files Browse the repository at this point in the history
  • Loading branch information
djoooooe committed Jun 10, 2020
1 parent 706a8d4 commit 9200dff
Show file tree
Hide file tree
Showing 83 changed files with 2,290 additions and 1,370 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import com.oracle.truffle.regex.charset.CodePointSetAccumulator;
import com.oracle.truffle.regex.charset.Range;
import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer;
import com.oracle.truffle.regex.tregex.string.Encodings;

public class MatcherBuilderTest {

Expand Down Expand Up @@ -110,25 +111,26 @@ private static void checkContains(CodePointSet a, CodePointSet b, boolean expect
}

/**
 * Hands the inverse of {@code a} to {@code checkMatch}, labelled {@code "inverse(<a>)"}, together
 * with {@code values}.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers, so both the no-argument
 * {@code createInverse()} call and the {@code createInverse(Encodings.UTF_16)} call appear.
 * Presumably the first is the removed line and the second its replacement - confirm against the
 * commit.
 *
 * @param a set whose inverse is checked
 * @param values expected contents forwarded to {@code checkMatch} (exact semantics are defined there)
 */
private static void checkInverse(CodePointSet a, int... values) {
checkMatch("inverse(" + a + ")", a.createInverse(), values);
checkMatch("inverse(" + a + ")", a.createInverse(Encodings.UTF_16), values);
}

/**
 * Cross-checks the intersection of {@code a} and {@code b}: the result of
 * {@code createIntersection} is passed to {@code checkMatch} with {@code values}, compared against
 * {@code intersects}, and compared against the three parts returned by
 * {@code intersectAndSubtract}.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers, so lines that create a fresh
 * {@code new CompilationBuffer()} per call appear next to lines that share one
 * {@code CompilationBuffer(Encodings.UTF_16)}. Presumably the former are the removed lines -
 * confirm against the commit.
 *
 * @param a first operand
 * @param b second operand
 * @param values expected contents of the intersection, forwarded to {@code checkMatch}
 */
private static void checkIntersection(CodePointSet a, CodePointSet b, int... values) {
CodePointSet intersection = a.createIntersection(b, new CompilationBuffer());
CompilationBuffer compilationBuffer = new CompilationBuffer(Encodings.UTF_16);
CodePointSet intersection = a.createIntersection(b, compilationBuffer);
checkMatch("intersection(" + a + "," + b + ")", intersection, values);
// intersects() has to agree with whether the computed intersection matches anything at all.
assertTrue("intersection(" + a + "," + b + ")", a.intersects(b) == intersection.matchesSomething());
CodePointSet.IntersectAndSubtractResult<CodePointSet> result = a.intersectAndSubtract(b, new CompilationBuffer());
checkMatch("intersectAndSubtract(" + a + "," + b + ")[0]", result.subtractedA, a.subtract(intersection, new CompilationBuffer()));
checkMatch("intersectAndSubtract(" + a + "," + b + ")[1]", result.subtractedB, b.subtract(intersection, new CompilationBuffer()));
CodePointSet.IntersectAndSubtractResult<CodePointSet> result = a.intersectAndSubtract(b, compilationBuffer);
// Each part of the combined result is compared with the same quantity computed separately:
// [0] a minus the intersection, [1] b minus the intersection, [2] the intersection itself.
checkMatch("intersectAndSubtract(" + a + "," + b + ")[0]", result.subtractedA, a.subtract(intersection, compilationBuffer));
checkMatch("intersectAndSubtract(" + a + "," + b + ")[1]", result.subtractedB, b.subtract(intersection, compilationBuffer));
checkMatch("intersectAndSubtract(" + a + "," + b + ")[2]", result.intersection, intersection);
}

/**
 * Passes {@code a.subtract(b, ...)} to {@code checkMatch}, labelled
 * {@code "subtraction(<a>,<b>)"}, together with {@code values}.
 *
 * NOTE(review): diff rendering without +/- markers; the call using {@code new CompilationBuffer()}
 * is presumably the removed line and the one using {@code new CompilationBuffer(Encodings.UTF_16)}
 * its replacement - confirm against the commit.
 *
 * @param a minuend
 * @param b subtrahend
 * @param values expected contents forwarded to {@code checkMatch}
 */
private static void checkSubtraction(CodePointSet a, CodePointSet b, int... values) {
checkMatch("subtraction(" + a + "," + b + ")", a.subtract(b, new CompilationBuffer()), values);
checkMatch("subtraction(" + a + "," + b + ")", a.subtract(b, new CompilationBuffer(Encodings.UTF_16)), values);
}

/**
 * Passes {@code a.union(b, ...)} to {@code checkMatch}, labelled {@code "union(<a>,<b>)"},
 * together with {@code values}.
 *
 * NOTE(review): diff rendering without +/- markers; the call using {@code new CompilationBuffer()}
 * is presumably the removed line and the one using {@code new CompilationBuffer(Encodings.UTF_16)}
 * its replacement - confirm against the commit.
 *
 * @param a first operand
 * @param b second operand
 * @param values expected contents forwarded to {@code checkMatch}
 */
private static void checkUnion(CodePointSet a, CodePointSet b, int... values) {
checkMatch("union(" + a + "," + b + ")", a.union(b, new CompilationBuffer()), values);
checkMatch("union(" + a + "," + b + ")", a.union(b, new CompilationBuffer(Encodings.UTF_16)), values);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
import org.junit.Test;

import com.oracle.truffle.regex.charset.RangesBuffer;
import com.oracle.truffle.regex.tregex.buffer.CharRangesBuffer;
import com.oracle.truffle.regex.tregex.buffer.IntRangesBuffer;

public class RangesBufferTest {
Expand All @@ -55,23 +54,14 @@ private static void appendAll(RangesBuffer buf, int[] content) {
}
}

/**
 * Builds a new {@link CharRangesBuffer} and fills it by handing {@code content} to
 * {@code appendAll}.
 *
 * @param content values forwarded to {@code appendAll}
 * @return the freshly created buffer
 */
private static CharRangesBuffer createCharRangesBuffer(int[] content) {
    final CharRangesBuffer ranges = new CharRangesBuffer();
    appendAll(ranges, content);
    return ranges;
}

/**
 * Builds a new {@link IntRangesBuffer} and fills it by handing {@code content} to
 * {@code appendAll}.
 *
 * @param content values forwarded to {@code appendAll}
 * @return the freshly created buffer
 */
private static IntRangesBuffer createIntRangesBuffer(int[] content) {
    final IntRangesBuffer ranges = new IntRangesBuffer();
    appendAll(ranges, content);
    return ranges;
}

/**
 * Compares {@code buf} with {@code content} by delegating to the overload for the buffer's
 * concrete type.
 *
 * NOTE(review): diff rendering without +/- markers. The {@code instanceof} dispatch with the
 * {@code CharRangesBuffer} fallback and the unconditional cast to {@code IntRangesBuffer} both
 * appear; presumably the former is the removed variant and the final {@code return} is the new
 * body - confirm against the commit.
 *
 * @param buf buffer to inspect
 * @param content expected values, forwarded to the typed overload
 * @return the result of the typed overload
 */
private static boolean equals(RangesBuffer buf, int[] content) {
if (buf instanceof IntRangesBuffer) {
return equals((IntRangesBuffer) buf, content);
}
return equals((CharRangesBuffer) buf, content);
return equals((IntRangesBuffer) buf, content);
}

private static boolean equals(IntRangesBuffer buf, int[] content) {
Expand All @@ -86,18 +76,6 @@ private static boolean equals(IntRangesBuffer buf, int[] content) {
return true;
}

/**
 * Tells whether {@code buf} holds exactly the ranges listed in {@code content}, where
 * {@code content} is read as consecutive (lo, hi) pairs and each value is narrowed to
 * {@code char} before it is compared.
 *
 * @param buf buffer to inspect
 * @param content expected bounds as a flat array of lo/hi pairs
 * @return {@code true} if the number of ranges and every bound agree, {@code false} otherwise
 */
private static boolean equals(CharRangesBuffer buf, int[] content) {
    final int expectedRanges = content.length / 2;
    if (buf.size() != expectedRanges) {
        return false;
    }
    int range = 0;
    while (range < expectedRanges) {
        final int loIndex = range * 2;
        // Short-circuits like the original: the upper bound is only read when the lower bound agrees.
        if (!(buf.getLo(range) == (char) content[loIndex] && buf.getHi(range) == (char) content[loIndex + 1])) {
            return false;
        }
        range++;
    }
    return true;
}

private static String matchError(String errorMsg, RangesBuffer m, int[] expected) {
StringBuilder sb = new StringBuilder(errorMsg).append(": got ").append(m.toString()).append(", expected [ ");
for (int i = 0; i < expected.length; i += 2) {
Expand All @@ -115,34 +93,23 @@ private static void checkAddRange(RangesBuffer buf, int lo, int hi, int[] expect

/**
 * Runs the buffer-based {@code checkAddRange} on buffers created from {@code buf}.
 *
 * NOTE(review): diff rendering without +/- markers; the {@code createCharRangesBuffer} call is
 * presumably a removed line, leaving only the {@code IntRangesBuffer} variant - confirm against
 * the commit.
 *
 * @param buf initial buffer contents
 * @param lo lower bound forwarded to the buffer-based overload
 * @param hi upper bound forwarded to the buffer-based overload
 * @param expected expected contents forwarded to the buffer-based overload
 */
private static void checkAddRange(int[] buf, int lo, int hi, int[] expected) {
checkAddRange(createIntRangesBuffer(buf), lo, hi, expected);
checkAddRange(createCharRangesBuffer(buf), lo, hi, expected);
}

/**
 * Runs the buffer-based {@code checkAddRange} on a single buffer created from {@code buf}.
 *
 * @param buf initial buffer contents
 * @param lo lower bound forwarded to the buffer-based overload
 * @param hi upper bound forwarded to the buffer-based overload
 * @param expected expected contents forwarded to the buffer-based overload
 * @param intBuffer {@code true} to use an {@code IntRangesBuffer}, {@code false} to use a
 *            {@code CharRangesBuffer}
 */
private static void checkAddRange(int[] buf, int lo, int hi, int[] expected, boolean intBuffer) {
    if (!intBuffer) {
        checkAddRange(createCharRangesBuffer(buf), lo, hi, expected);
        return;
    }
    checkAddRange(createIntRangesBuffer(buf), lo, hi, expected);
}

@Test
public void testAddRange() {
int max = Character.MAX_VALUE;
checkAddRange(new int[]{}, 0, 1, new int[]{0, 1});
for (boolean intBuffer : new boolean[]{true, false}) {
int max = intBuffer ? Character.MAX_CODE_POINT : Character.MAX_VALUE;
checkAddRange(new int[]{}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{0, 0}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{1, 1}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{0, 2}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{max - 1, max}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{max - 1, max - 1}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{max, max}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max}, intBuffer);
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, max, max, new int[]{2, 2, 4, 4, 6, 7, max, max}, intBuffer);
checkAddRange(new int[]{2, 2, 4, 4, 6, 7, max, max}, max - 1, max - 1, new int[]{2, 2, 4, 4, 6, 7, max - 1, max}, intBuffer);
}
checkAddRange(new int[]{}, 0, max, new int[]{0, max});
checkAddRange(new int[]{0, 0}, 0, max, new int[]{0, max});
checkAddRange(new int[]{1, 1}, 0, max, new int[]{0, max});
checkAddRange(new int[]{0, 2}, 0, max, new int[]{0, max});
checkAddRange(new int[]{max - 1, max}, 0, max, new int[]{0, max});
checkAddRange(new int[]{max - 1, max - 1}, 0, max, new int[]{0, max});
checkAddRange(new int[]{max, max}, 0, max, new int[]{0, max});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 0, max, new int[]{0, max});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, max, max, new int[]{2, 2, 4, 4, 6, 7, max, max});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7, max, max}, max - 1, max - 1, new int[]{2, 2, 4, 4, 6, 7, max - 1, max});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 2, 2, new int[]{2, 2, 4, 4, 6, 7});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 9, 9, new int[]{2, 2, 4, 4, 6, 7, 9, 9});
checkAddRange(new int[]{2, 2, 4, 4, 6, 7}, 8, 8, new int[]{2, 2, 4, 4, 6, 8});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,12 @@
*/
package com.oracle.truffle.regex;

import java.util.logging.Level;

import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.interop.TruffleObject;
import com.oracle.truffle.regex.tregex.util.DebugUtil;

import java.util.logging.Level;

import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_BAILOUT_MESSAGES;
import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_COMPILER_FALLBACK;
import static com.oracle.truffle.regex.tregex.util.DebugUtil.LOG_TOTAL_COMPILATION_TIME;
import com.oracle.truffle.regex.tregex.util.Loggers;

public class RegexCompilerWithFallback implements RegexCompiler {

Expand Down Expand Up @@ -79,9 +76,9 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu
if (shouldLog) {
elapsedTimeMain = timer.getElapsed();
}
LOG_COMPILER_FALLBACK.finer(() -> "Primary compiler used: " + regexSource);
Loggers.LOG_COMPILER_FALLBACK.finer(() -> "Primary compiler used: " + regexSource);
} catch (UnsupportedRegexException mainBailout) {
LOG_BAILOUT_MESSAGES.fine(() -> mainBailout.getReason() + ": " + regexSource);
Loggers.LOG_BAILOUT_MESSAGES.fine(() -> mainBailout.getReason() + ": " + regexSource);
try {
if (shouldLog) {
timer.start();
Expand All @@ -90,9 +87,9 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu
if (shouldLog) {
elapsedTimeFallback = timer.getElapsed();
}
LOG_COMPILER_FALLBACK.fine(() -> String.format("Secondary compiler used (primary bailout due to '%s'): %s", mainBailout.getReason(), regexSource));
Loggers.LOG_COMPILER_FALLBACK.fine(() -> String.format("Secondary compiler used (primary bailout due to '%s'): %s", mainBailout.getReason(), regexSource));
} catch (UnsupportedRegexException fallbackBailout) {
LOG_COMPILER_FALLBACK.fine(() -> String.format("No compiler handled following regex (primary bailout: '%s'; secondary bailout: '%s'): %s", mainBailout.getReason(),
Loggers.LOG_COMPILER_FALLBACK.fine(() -> String.format("No compiler handled following regex (primary bailout: '%s'; secondary bailout: '%s'): %s", mainBailout.getReason(),
fallbackBailout.getReason(), regexSource));
String bailoutReasons = String.format("%s; %s", mainBailout.getReason(), fallbackBailout.getReason());
throw new UnsupportedRegexException(bailoutReasons, regexSource);
Expand All @@ -105,11 +102,11 @@ public Object compile(RegexSource regexSource) throws RegexSyntaxException, Unsu
}

/**
 * Reports whether the {@code LOG_TOTAL_COMPILATION_TIME} logger accepts messages at
 * {@link Level#FINE}.
 *
 * NOTE(review): diff rendering without +/- markers; the unqualified reference is presumably the
 * removed line and the {@code Loggers.}-qualified one its replacement - confirm against the
 * commit.
 *
 * @return the result of {@code isLoggable(Level.FINE)} on that logger
 */
private static boolean shouldLogCompilationTime() {
return LOG_TOTAL_COMPILATION_TIME.isLoggable(Level.FINE);
return Loggers.LOG_TOTAL_COMPILATION_TIME.isLoggable(Level.FINE);
}

private static void logCompilationTime(RegexSource regexSource, long elapsedTimeMain, long elapsedTimeFallback) {
LOG_TOTAL_COMPILATION_TIME.log(Level.FINE, "{0}, {1}, {2}, {3}", new Object[]{
Loggers.LOG_TOTAL_COMPILATION_TIME.log(Level.FINE, "{0}, {1}, {2}, {3}", new Object[]{
DebugUtil.Timer.elapsedToString(elapsedTimeMain + elapsedTimeFallback),
DebugUtil.Timer.elapsedToString(elapsedTimeMain),
DebugUtil.Timer.elapsedToString(elapsedTimeFallback),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import com.oracle.truffle.regex.RegexSyntaxException;
import com.oracle.truffle.regex.tregex.parser.RegexLexer;
import com.oracle.truffle.regex.tregex.parser.Token;
import com.oracle.truffle.regex.tregex.string.Encodings;

/**
* Generates a "unified" regular expression representation where all single characters are replaced
Expand All @@ -62,7 +63,7 @@ public final class RegexUnifier {

/**
 * Stores {@code source}, creates a {@link RegexLexer} for it using the flags parsed from
 * {@code source.getFlags()} and {@code RegexOptions.DEFAULT}, and allocates the output
 * {@link StringBuilder} with the pattern length as its initial capacity.
 *
 * NOTE(review): diff rendering without +/- markers; the lexer construction without an encoding is
 * presumably the removed line and the one passing {@code Encodings.UTF_32} its replacement -
 * confirm against the commit.
 *
 * @param source the regular expression to process
 */
public RegexUnifier(RegexSource source) {
this.source = source;
this.lexer = new RegexLexer(source, RegexFlags.parseFlags(source.getFlags()), RegexOptions.DEFAULT);
this.lexer = new RegexLexer(source, RegexFlags.parseFlags(source.getFlags()), Encodings.UTF_32, RegexOptions.DEFAULT);
this.dump = new StringBuilder(source.getPattern().length());
}

Expand Down
Loading

0 comments on commit 9200dff

Please sign in to comment.