Skip to content

Commit

Permalink
TRegex: Do not merge non-deterministic prefixes
Browse files Browse the repository at this point in the history
That can lead to a reordering of the branches
and change the semantics of the regex.
  • Loading branch information
jirkamarsik committed Feb 22, 2021
1 parent df8c7ce commit 955d0d6
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,9 @@ public void gr29379() {
test("(?=^)|(?=$)", "", "", 0, true, 0, 0);
test("(?=^)|(?=$)(?=^)|(?=$)", "", "", 0, true, 0, 0);
}

@Test
public void gr29388() {
test(".+(?=bar)|.+", "", "foobar", 0, true, 0, 3);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -924,11 +924,22 @@ private void mergeCommonPrefixes(Group group) {
return;
}
ArrayList<Sequence> newAlternatives = null;
// This optimization could change the order in which different paths are explored during
// backtracking and therefore change the semantics of the expression. After the
// optimization, the resulting regex will first try to match the prefix and then try to
// continue with any of the possible suffixes before backtracking and trying different
// branches in the prefix. However, the original regex will first completely exhaust all
// options in the prefix while retaining the first suffix.
// See the example: /.+(?=bar)|.+/.exec("foobar"), in which it is important to try to
// backtrack the .+ prefix to just "foo" and then satisfy the lookahead.
// In order to handle this, we limit this optimization so that only deterministic prefixes
// are considered.
IsDeterministicVisitor isDeterministicVisitor = new IsDeterministicVisitor();
int lastEnd = 0;
int begin = 0;
while (begin + 1 < group.size()) {
int end = findMatchingAlternatives(group, begin);
if (end < 0) {
if (end < 0 || !isDeterministicVisitor.isDeterministic(group.getAlternatives().get(begin).getFirstTerm())) {
begin++;
} else {
if (newAlternatives == null) {
Expand All @@ -939,7 +950,7 @@ private void mergeCommonPrefixes(Group group) {
}
lastEnd = end;
int prefixSize = 1;
while (alternativesAreEqualAt(group, begin, end, prefixSize)) {
while (alternativesAreEqualAt(group, begin, end, prefixSize) && isDeterministicVisitor.isDeterministic(group.getAlternatives().get(begin).get(prefixSize))) {
prefixSize++;
}
Sequence prefixSeq = ast.createSequence();
Expand Down Expand Up @@ -1010,6 +1021,52 @@ private void mergeCommonPrefixes(Group group) {
}
}

private static final class IsDeterministicVisitor extends DepthFirstTraversalRegexASTVisitor {

private boolean result;

public boolean isDeterministic(RegexASTNode node) {
result = true;
run(node);
return result;
}

private static boolean hasNonDeterministicQuantifier(QuantifiableTerm term) {
if (term.hasQuantifier()) {
Token.Quantifier quantifier = term.getQuantifier();
if (quantifier.getMin() != quantifier.getMax()) {
return true;
}
}
return false;
}

@Override
protected void visit(BackReference backReference) {
if (hasNonDeterministicQuantifier(backReference)) {
result = false;
}
}

@Override
protected void visit(Group group) {
if (hasNonDeterministicQuantifier(group)) {
result = false;
return;
}
if (group.getAlternatives().size() > 1) {
result = false;
}
}

@Override
protected void visit(CharacterClass characterClass) {
if (hasNonDeterministicQuantifier(characterClass)) {
result = false;
}
}
}

/**
* Returns {@code true} iff the term at index {@code iTerm} of all alternatives in {@code group}
* from index {@code altBegin} (inclusive) to {@code altEnd} (exclusive) are semantically equal.
Expand Down

0 comments on commit 955d0d6

Please sign in to comment.