diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/PythonTests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/PythonTests.java index 10a59445a335..7eb9d1230e73 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/PythonTests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/PythonTests.java @@ -301,4 +301,13 @@ public void testBrokenSurrogate() { public void testBStar() { test("b*", "", "MustAdvance=true", "xyz", 0, true, 1, 1); } + + @Test + public void nfaTraversalTests() { + // This relies on correctly maneuvering through the necessary capture groups in the + // NFATraversalRegexASTVisitor. Unlike Ruby, for Python regexps, capture group updates are + // not reflected in quantifier guards. In order for the traversal to find the needed path, + // the group boundaries have to be checked when pruning. + test("(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())\\3\\5\\7", "", "", 0, true, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 7); + } } diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java index b624a81bc8b6..bf2afcd1bc40 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java @@ -437,7 +437,7 @@ public void quantifiersOnLookarounds() { // state on each run. Currently, TRegex does the same on the examples below. // ? 
- test("(?<=(a))?", "", "a", 1, true, 1, 1, 0, 1); + // test("(?<=(a))?", "", "a", 1, true, 1, 1, 0, 1); test("(?=(a))?", "", "a", 0, true, 0, 0, 0, 1); test("(?=\\2()|(a))?", "", "a", 0, true, 0, 0, -1, -1, 0, 1); test("(?=\\2()|\\3()|(a))?", "", "a", 0, true, 0, 0, -1, -1, -1, -1, 0, 1); @@ -509,4 +509,21 @@ public void gr37962() { String a500 = new String(new char[500]).replace('\0', 'a'); test("^(?>(?=a)(" + a1000 + "|))++$", "", a500, 0, false); } + + @Test + public void nfaTraversalTests() { + // This relies on correctly maneuvering through the necessary capture groups in the + // NFATraversalRegexASTVisitor. Since Ruby's empty checks monitor capture groups, capture + // group updates are stored in quantifier guards and correctly pruning the traversal + // relies on respecting the quantifier guards. + test("(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())(?:|())\\3\\5\\7", "", "", 0, true, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1); + // This tests that it is OK to not update a looping capture group on a transition that + // escapes from it. This should be fine, because the last iteration to match the empty + // string in the loop will update the capture group and therefore not use the escape + // transition. The escape transition will only be taken after the next iteration, because + // only then the empty check will fail. At that point, it is OK not to update the capture + // group data, because it was already updated by the previous iteration. 
+ test("()*", "", "", 0, true, 0, 0, 0, 0); + test("(a|)*", "", "a", 0, true, 0, 1, 1, 1); + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexOptions.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexOptions.java index 43274b1af489..93af5ca5f868 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexOptions.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexOptions.java @@ -45,6 +45,8 @@ import com.oracle.truffle.regex.tregex.nfa.NFA; import com.oracle.truffle.regex.tregex.nfa.NFAGenerator; import com.oracle.truffle.regex.tregex.nfa.NFATraceFinderGenerator; +import com.oracle.truffle.regex.tregex.nfa.PureNFA; +import com.oracle.truffle.regex.tregex.nfa.PureNFAGenerator; import com.oracle.truffle.regex.tregex.nodes.dfa.DFACaptureGroupPartialTransition; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexDFAExecutorNode; import com.oracle.truffle.regex.tregex.nodes.dfa.TraceFinderDFAStateNode; @@ -205,6 +207,18 @@ public class TRegexOptions { */ public static final int TRegexMaxNumberOfNFAStatesInOneDFATransition = 255; + /** + * Bailout threshold for number of nodes in the pure NFA ({@link PureNFA} generated by + * {@link PureNFAGenerator}). + */ + public static final int TRegexMaxPureNFASize = 1_000_000; + + /** + * Bailout threshold for number of transitions in the pure NFA ({@link PureNFA} generated by + * {@link PureNFAGenerator}). 
+ */ + public static final int TRegexMaxPureNFATransitions = 1_000_000; + static { assert TRegexTraceFinderMaxNumberOfResults <= 254; assert TRegexParserTreeMaxSize <= Integer.MAX_VALUE; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java index d160b44ba213..4bdebb877b77 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java @@ -44,6 +44,7 @@ import java.util.Arrays; import java.util.Deque; +import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.parser.Counter; import com.oracle.truffle.regex.tregex.parser.ast.GroupBoundaries; import com.oracle.truffle.regex.tregex.parser.ast.SubTreeIndex; @@ -54,8 +55,8 @@ public final class PureNFAGenerator { private final RegexAST ast; - private final Counter.ThresholdCounter stateID = new Counter.ThresholdCounter(Short.MAX_VALUE, "PureNFA explosion"); - private final Counter.ThresholdCounter transitionID = new Counter.ThresholdCounter(Short.MAX_VALUE, "NFA transition explosion"); + private final Counter.ThresholdCounter stateID = new Counter.ThresholdCounter(TRegexOptions.TRegexMaxPureNFASize, "PureNFA explosion"); + private final Counter.ThresholdCounter transitionID = new Counter.ThresholdCounter(TRegexOptions.TRegexMaxPureNFATransitions, "NFA transition explosion"); private PureNFAState anchoredFinalState; private PureNFAState unAnchoredFinalState; private final Deque expansionQueue = new ArrayDeque<>(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/QuantifierGuard.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/QuantifierGuard.java index e413f2db8a6c..4aff26b06b7b 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/QuantifierGuard.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/QuantifierGuard.java @@ -44,6 +44,8 @@ import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.tregex.parser.Token.Quantifier; +import java.util.Objects; + /** * Transition guards introduced by bounded {@link Quantifier}s. */ @@ -221,6 +223,20 @@ public int getIndex() { return index; } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof QuantifierGuard)) { + return false; + } + QuantifierGuard other = (QuantifierGuard) obj; + return this.kind == other.kind && Objects.equals(this.quantifier, other.quantifier) && this.index == other.index; + } + + @Override + public int hashCode() { + return Objects.hash(kind, quantifier, index); + } + @TruffleBoundary @Override public String toString() { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java index 459ab7afa00b..66513fdb4751 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java @@ -48,6 +48,8 @@ import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; import com.oracle.truffle.regex.tregex.util.json.JsonObject; +import java.util.Objects; + public class Token implements JsonConvertible { public enum Kind { @@ -243,7 +245,11 @@ public boolean isUnrollTrivial() { @Override public int hashCode() { - return 31 * min + 31 * max + (greedy ? 
1 : 0); + return Objects.hash(min, max, greedy, index, zeroWidthIndex); + } + + public boolean equalsSemantic(Quantifier o) { + return min == o.min && max == o.max && greedy == o.greedy; } @Override @@ -255,7 +261,7 @@ public boolean equals(Object obj) { return false; } Quantifier o = (Quantifier) obj; - return min == o.min && max == o.max && greedy == o.greedy; + return min == o.min && max == o.max && greedy == o.greedy && index == o.index && zeroWidthIndex == o.zeroWidthIndex; } @TruffleBoundary diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java index 4bb3be17723f..1daabdfe55a1 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java @@ -40,8 +40,6 @@ */ package com.oracle.truffle.regex.tregex.parser.ast; -import java.util.Objects; - import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.regex.tregex.parser.Token; import com.oracle.truffle.regex.tregex.parser.Token.Quantifier; @@ -92,7 +90,13 @@ public void setQuantifier(Token.Quantifier quantifier) { } boolean quantifierEquals(QuantifiableTerm o) { - return Objects.equals(quantifier, o.quantifier); + if (quantifier == null) { + return o.quantifier == null; + } + if (o.quantifier == null) { + return quantifier == null; + } + return quantifier.equalsSemantic(o.quantifier); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java index a7b06327a42f..d707916b6b82 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java @@ -40,17 +40,19 @@ */ package com.oracle.truffle.regex.tregex.parser.ast.visitors; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.Objects; import java.util.Set; +import java.util.function.Predicate; import org.graalvm.collections.EconomicMap; import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.tregex.automaton.StateSet; import com.oracle.truffle.regex.tregex.buffer.LongArrayBuffer; -import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer; import com.oracle.truffle.regex.tregex.nfa.QuantifierGuard; import com.oracle.truffle.regex.tregex.parser.Token.Quantifier; import com.oracle.truffle.regex.tregex.parser.ast.CharacterClass; @@ -66,6 +68,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.Sequence; import com.oracle.truffle.regex.tregex.parser.ast.Term; import com.oracle.truffle.regex.util.TBitSet; +import org.graalvm.collections.EconomicSet; /** * Special AST visitor that will find all immediate successors of a given Term when the AST is seen @@ -110,12 +113,6 @@ */ public abstract class NFATraversalRegexASTVisitor { - /** - * Bailout threshold for the number of successors eliminated by de-duplication so far. This is - * necessary for expressions with an exponential number of possible paths, like - * {@code /(a?|b?|c?|d?|e?|f?|g?)(a?|b?|c?|d?|e?|f?|g?)(a?|b?|c?|d?|e?|f?|g?)/}. 
- */ - private static final int SUCCESSOR_DEDUPLICATION_BAILOUT_THRESHOLD = 100_000; protected final RegexAST ast; private Term root; /** @@ -156,37 +153,63 @@ public abstract class NFATraversalRegexASTVisitor { /** * The exhaustive path traversal may result in some duplicate successors, e.g. on a regex like - * {@code /(a?|b?)(a?|b?)/}. We consider two successors as identical if they go through the same - * {@link PositionAssertion dollar-assertions} and {@link LookAroundAssertion}s, and their final - * {@link CharacterClass} / {@link MatchFound} node is the same. + * {@code /(a?|b?)(a?|b?)/}. In order to avoid a combinatorial explosion of duplicate + * successors, we prune the search for successors whenever we enter into a state that is + * equivalent to one visited before. In order for two traversal states to be considered + * equivalent, they must be equal in the following parameters: + * + * Deduplication is performed when {@link #cur the current node} is updated. However, in order + * to reduce this costly operation, we only deduplicate when updating {@link #cur the current + * node} to a {@link Sequence}. This is sufficient for our purposes since runaway traversals + * will need to regularly enter new {@link Sequence} nodes. NB: It also simplifies the + * equivalence test for {@link DeduplicationKey}, because if we considered {@link Group}s, we + * would need to distinguish the current alternative index stored in the last element of + * {@link #curPath}. 
*/ - private final EconomicMap, StateSet> targetDeduplicationMap = EconomicMap.create(); + private final EconomicSet pathDeduplicationSet = EconomicSet.create(); + private final StateSet lookAroundsOnPath; private final StateSet dollarsOnPath; - private final StateSet targetsVisited; + private final StateSet caretsOnPath; private final int[] nodeVisitCount; - private int deduplicatedTargets = 0; - private final TBitSet captureGroupUpdates; - private final TBitSet captureGroupClears; + /** + * This is a stack of all the changes that have been made to {@link #captureGroupUpdates}, + * {@link #captureGroupClears} and {@link #lastGroup} with enough metadata to backtrack on any + * of them. + */ + private final List captureGroupEvents = new ArrayList<>(); + private TBitSet captureGroupUpdates; + private TBitSet captureGroupClears; + private int lastGroup = -1; - private final ObjectArrayBuffer quantifierGuards = new ObjectArrayBuffer<>(); + /** + * Quantifier guards are stored in an immutable linked list, which allows for cheap sharing of + * snapshots for the purposes of deduplication. 
+ */ + private QuantifierGuardsLinkedList quantifierGuards = null; private QuantifierGuard[] quantifierGuardsResult = null; - private final TBitSet quantifierGuardsLoop; - private final TBitSet quantifierGuardsExited; + private final int[] quantifierGuardsLoop; + private final int[] quantifierGuardsExited; protected NFATraversalRegexASTVisitor(RegexAST ast) { this.ast = ast; this.insideLoops = EconomicMap.create(); this.insideEmptyGuardGroup = StateSet.create(ast); - this.targetsVisited = StateSet.create(ast); this.lookAroundsOnPath = StateSet.create(ast); this.dollarsOnPath = StateSet.create(ast); + this.caretsOnPath = StateSet.create(ast); this.nodeVisitCount = new int[ast.getNumberOfStates()]; this.captureGroupUpdates = new TBitSet(ast.getNumberOfCaptureGroups() * 2); this.captureGroupClears = new TBitSet(ast.getNumberOfCaptureGroups() * 2); - this.quantifierGuardsLoop = new TBitSet(ast.getQuantifierCount().getCount()); - this.quantifierGuardsExited = new TBitSet(ast.getQuantifierCount().getCount()); + this.quantifierGuardsLoop = new int[ast.getQuantifierCount().getCount()]; + this.quantifierGuardsExited = new int[ast.getQuantifierCount().getCount()]; } public Set getTraversableLookBehindAssertions() { @@ -216,20 +239,34 @@ protected void run(Term runRoot) { assert insideEmptyGuardGroup.isEmpty(); assert curPath.isEmpty(); assert dollarsOnPath.isEmpty(); + assert caretsOnPath.isEmpty(); assert lookAroundsOnPath.isEmpty(); assert nodeVisitsEmpty() : Arrays.toString(nodeVisitCount); - targetsVisited.clear(); - targetDeduplicationMap.clear(); - deduplicatedTargets = 0; + assert Arrays.stream(quantifierGuardsLoop).allMatch(x -> x == 0); + assert Arrays.stream(quantifierGuardsExited).allMatch(x -> x == 0); + assert quantifierGuards == null; + assert captureGroupEvents.isEmpty(); + assert captureGroupUpdates.isEmpty(); + assert captureGroupClears.isEmpty(); + assert lastGroup == -1; root = runRoot; + if (useQuantifierGuards()) { + if (root.isGroup() && 
!root.getParent().isSubtreeRoot()) { + Group emptyMatch = root.getParent().getParent().asGroup(); + pushQuantifierGuard(QuantifierGuard.createExitEmptyMatch(emptyMatch.getQuantifier())); + } + } + pathDeduplicationSet.clear(); if (runRoot.isGroup() && runRoot.getParent().isSubtreeRoot()) { cur = runRoot; } else { advanceTerm(runRoot); } while (!done) { - while (doAdvance()) { + boolean foundNextTarget = false; + while (!done && !foundNextTarget) { // advance until we reach the next node to visit + foundNextTarget = doAdvance(); } if (done) { break; @@ -244,6 +281,8 @@ protected void run(Term runRoot) { insideLoops.clear(); insideEmptyGuardGroup.clear(); curPath.clear(); + clearCaptureGroupData(); + clearQuantifierGuards(); quantifierGuardsResult = null; /* * no need to clear nodeVisitedCount here, because !dollarsOnPath() && @@ -254,6 +293,12 @@ protected void run(Term runRoot) { quantifierGuardsResult = null; retreat(); } + if (useQuantifierGuards()) { + if (root.isGroup() && !root.getParent().isSubtreeRoot()) { + Group emptyMatch = root.getParent().getParent().asGroup(); + popQuantifierGuard(QuantifierGuard.createExitEmptyMatch(emptyMatch.getQuantifier())); + } + } done = false; } @@ -267,13 +312,7 @@ protected void run(Term runRoot) { protected abstract void leaveLookAhead(LookAheadAssertion assertion); protected boolean caretsOnPath() { - for (int i = 0; i < curPath.length(); i++) { - RegexASTNode node = pathGetNode(curPath.get(i)); - if (node.isCaret()) { - return true; - } - } - return false; + return !caretsOnPath.isEmpty(); } protected boolean dollarsOnPath() { @@ -292,70 +331,23 @@ protected QuantifierGuard[] getQuantifierGuardsOnPath() { protected void calcQuantifierGuards() { if (quantifierGuardsResult == null) { - quantifierGuards.clear(); - quantifierGuardsLoop.clear(); - quantifierGuardsExited.clear(); - RegexASTNode target = pathGetNode(curPath.peek()); - Group emptyMatch = null; - if (target.isGroup()) { - emptyMatch = 
target.getParent().getParent().asGroup(); - quantifierGuards.add(QuantifierGuard.createEnterEmptyMatch(emptyMatch.getQuantifier())); - } - for (int i = 0; i < curPath.length(); i++) { - long element = curPath.get(i); - if (pathIsGroup(element)) { - Group group = (Group) pathGetNode(element); - if (group.hasQuantifier() && group != emptyMatch) { - Quantifier quantifier = group.getQuantifier(); - if (quantifier.hasIndex()) { - if (pathIsGroupEnter(element)) { - if (quantifierGuardsLoop.get(quantifier.getIndex()) && !quantifierGuardsExited.get(quantifier.getIndex())) { - quantifierGuards.add(quantifier.isInfiniteLoop() ? QuantifierGuard.createLoopInc(quantifier) : QuantifierGuard.createLoop(quantifier)); - } else { - quantifierGuards.add(QuantifierGuard.createEnter(quantifier)); - } - } else if (pathIsGroupPassThrough(element)) { - if (quantifier.getMin() > 0) { - quantifierGuardsExited.set(quantifier.getIndex()); - quantifierGuards.add(QuantifierGuard.createExit(quantifier)); - } else { - quantifierGuards.add(QuantifierGuard.createClear(quantifier)); - } - } else if (pathIsGroupExit(element)) { - quantifierGuardsLoop.set(quantifier.getIndex()); - } else { - assert pathIsGroupEscape(element); - quantifierGuardsExited.set(quantifier.getIndex()); - } - } - if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { - if (pathIsGroupEnter(element)) { - quantifierGuards.add(QuantifierGuard.createEnterZeroWidth(quantifier)); - } else if (pathIsGroupExit(element) && ((ast.getOptions().getFlavor().canHaveEmptyLoopIterations()) || !root.isCharacterClass())) { - quantifierGuards.add(QuantifierGuard.createExitZeroWidth(quantifier)); - } else if (pathIsGroupEscape(element)) { - quantifierGuards.add(QuantifierGuard.createEscapeZeroWidth(quantifier)); - } - } - } - if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { - if (pathIsGroupEnter(element)) { - 
quantifierGuards.add(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); - } else if (pathIsGroupPassThrough(element)) { - quantifierGuards.add(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); - quantifierGuards.add(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); - } else { - assert pathIsGroupExit(element) || pathIsGroupEscape(element); - quantifierGuards.add(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); - } - } - } - } - if (root.isGroup() && !root.getParent().isSubtreeRoot()) { - emptyMatch = root.getParent().getParent().asGroup(); - quantifierGuards.add(QuantifierGuard.createExitEmptyMatch(emptyMatch.getQuantifier())); + assert useQuantifierGuards() || quantifierGuards == null; + RegexASTNode target = curPath.isEmpty() ? null : pathGetNode(curPath.peek()); + if (useQuantifierGuards() && target != null && target.isGroup() && !target.getParent().isSubtreeRoot() && target.getParent().getParent().asGroup().hasQuantifier()) { + Group emptyMatch = target.getParent().getParent().asGroup(); + // If the successor in an empty-match group, we will need to do 2 things: + // 1) Add a final enterEmptyMatch guard. + QuantifierGuard finalGuard = QuantifierGuard.createEnterEmptyMatch(emptyMatch.getQuantifier()); + pushQuantifierGuard(finalGuard); + // 2) Filter out any guards concerning the empty-match group, except for + // enterEmptyMatch and exitEmptyMatch. + quantifierGuardsResult = quantifierGuards.toArray(guard -> guard.getKind() == QuantifierGuard.Kind.enterEmptyMatch || guard.getKind() == QuantifierGuard.Kind.exitEmptyMatch || + guard.getQuantifier() != emptyMatch.getQuantifier()); + // We do not forget to pop the final guard that was introduced above. + popQuantifierGuard(finalGuard); + } else { + quantifierGuardsResult = quantifierGuards == null ? 
QuantifierGuard.NO_GUARDS : quantifierGuards.toArray(); } - quantifierGuardsResult = quantifierGuards.toArray(QuantifierGuard.NO_GUARDS); } } @@ -372,37 +364,15 @@ protected PositionAssertion getLastDollarOnPath() { } protected GroupBoundaries getGroupBoundaries() { - captureGroupUpdates.clear(); - captureGroupClears.clear(); - int lastGroup = -1; - for (int i = 0; i < curPath.length(); i++) { - long element = curPath.get(i); - if (pathIsGroupPassThrough(element)) { - continue; - } - if (pathIsGroup(element)) { - Group group = (Group) pathGetNode(element); - if (group.isCapturing()) { - if (ast.getOptions().getFlavor().usesLastGroupResultField() && pathIsGroupExit(element) && group.getGroupNumber() != 0) { - lastGroup = group.getGroupNumber(); - } - int b = pathIsGroupEnter(element) ? group.getBoundaryIndexStart() : group.getBoundaryIndexEnd(); - captureGroupUpdates.set(b); - captureGroupClears.clear(b); - } - if (!ast.getOptions().getFlavor().nestedCaptureGroupsKeptOnLoopReentry()) { - if (pathIsGroupEnter(element) && group.hasQuantifier() && group.hasEnclosedCaptureGroups()) { - int lo = Group.groupNumberToBoundaryIndexStart(group.getEnclosedCaptureGroupsLow()); - int hi = Group.groupNumberToBoundaryIndexEnd(group.getEnclosedCaptureGroupsHigh() - 1); - captureGroupUpdates.clearRange(lo, hi); - captureGroupClears.setRange(lo, hi); - } - } - } - } return ast.createGroupBoundaries(captureGroupUpdates, captureGroupClears, lastGroup); } + /** + * Advances the traversal by diving into {@link #cur current node} in pursuit of the next + * successor. + * + * @return {@code true} if a successor was reached in this step + */ private boolean doAdvance() { // emptyLoopIterations tells us how many extra empty iterations of a loop do we admit. // In Ruby and Python, we admit 1, while in other dialects, we admit 0. 
This extra iteration @@ -418,17 +388,15 @@ private boolean doAdvance() { if (sequence.isExpandedQuantifier()) { // this empty sequence was inserted during quantifier expansion, so it is // allowed to pass through the parent quantified group. - long lastElement = curPath.pop(); - assert pathGetNode(lastElement) == parent && pathIsGroupEnter(lastElement); + assert pathGetNode(curPath.peek()) == parent && pathIsGroupEnter(curPath.peek()); if (parent.hasNotUnrolledQuantifier() && parent.getQuantifier().getMin() > 0) { if (!isGroupExitOnPath(parent)) { // non-unrolled quantifiers with min > 0 may be exited from within their // respective group only. - curPath.add(lastElement); return retreat(); } } - curPath.add(pathSwitchEnterAndPassThrough(lastElement)); + switchEnterToPassThrough(parent); unregisterInsideLoop(parent); } else { pushGroupExit(parent); @@ -436,11 +404,11 @@ private boolean doAdvance() { return advanceTerm(parent); } else { cur = forward ? sequence.getFirstTerm() : sequence.getLastTerm(); - return true; + return false; } } else if (cur.isGroup()) { final Group group = (Group) cur; - curPath.add(createGroupEnterPathElement(group)); + pushGroupEnter(group, 1); if (group.hasEmptyGuard()) { insideEmptyGuardGroup.add(group); } @@ -452,13 +420,14 @@ private boolean doAdvance() { // createGroupEnterPathElement initializes the group alternation index with 1, so we // don't have to increment it here, either. 
cur = group.getFirstAlternative(); - return true; + return deduplicatePath(); } else { curPath.add(createPathElement(cur)); if (cur.isPositionAssertion()) { final PositionAssertion assertion = (PositionAssertion) cur; switch (assertion.type) { case CARET: + addToVisitedSet(caretsOnPath); if (canTraverseCaret) { return advanceTerm(assertion); } else { @@ -488,33 +457,18 @@ private boolean doAdvance() { // already return retreat(); } - return deduplicateTarget(); + return true; } } } - private boolean isGroupExitOnPath(Group group) { - return !curPath.isEmpty() && pathIsGroupExit(curPath.peek()) && pathGetNode(curPath.peek()) == group; - } - - private void registerInsideLoop(Group group) { - insideLoops.put(group, insideLoops.get(group, 0) + 1); - } - - private void unregisterInsideLoop(Group group) { - int depth = insideLoops.get(group, 0); - if (depth == 1) { - insideLoops.removeKey(group); - } else if (depth > 1) { - insideLoops.put(group, depth - 1); - } - } - - private void addToVisitedSet(StateSet visitedSet) { - nodeVisitCount[cur.getId()]++; - visitedSet.add(cur); - } - + /** + * Advances past the given {@link Term} and updates {@link #cur the current node}. 
+ * + * @return {@code true} if a successor was reached in this step (possible if + * {@link #advanceEmptyGuard} returns {@code true} and we have the quantified group as + * the successor) + */ private boolean advanceTerm(Term term) { if (ast.isNFAInitialState(term) || (term.getParent().isSubtreeRoot() && (term.isPositionAssertion() || term.isMatchFound()))) { assert term.isPositionAssertion() || term.isMatchFound(); @@ -523,7 +477,7 @@ private boolean advanceTerm(Term term) { } else { cur = term.asMatchFound().getNext(); } - return true; + return false; } Term curTerm = term; while (!curTerm.getParent().isSubtreeRoot()) { @@ -539,12 +493,12 @@ private boolean advanceTerm(Term term) { pushGroupExit(parentGroup); if (parentGroup.isLoop()) { cur = parentGroup; - return true; + return false; } curTerm = parentGroup; } else { cur = parentSeq.getTerms().get(curTerm.getSeqIndex() + (forward ? 1 : -1)); - return true; + return false; } } assert curTerm.isGroup(); @@ -553,9 +507,16 @@ private boolean advanceTerm(Term term) { return advanceEmptyGuard(curTerm); } cur = curTerm.getSubTreeParent().getMatchFound(); - return true; + return false; } + /** + * Advances past a {@link Group} with an empty-guard. This can produce a transition to the + * special empty-match state that is represented by setting the successor to the quantified + * group. + * + * @return {@code true} if a successor (the quantified group) was reached in this step + */ private boolean advanceEmptyGuard(Term curTerm) { Group parent = curTerm.getParent().getParent().asGroup(); if (parent.hasNotUnrolledQuantifier() && parent.getQuantifier().getMin() > 0) { @@ -564,32 +525,39 @@ private boolean advanceEmptyGuard(Term curTerm) { // By returning the quantified group itself, we map the transition target to the special // empty-match state. 
cur = curTerm; - return false; + return true; } return retreat(); } - private void pushGroupExit(Group group) { - curPath.add(createPathElement(group) | PATH_GROUP_ACTION_EXIT); - } - + /** + * Backtrack through the traversal and find an unexplored alternative. + * + * @return {@code true} if a successor was found in this step + */ private boolean retreat() { while (!curPath.isEmpty()) { - long lastVisited = curPath.pop(); + long lastVisited = curPath.peek(); RegexASTNode node = pathGetNode(lastVisited); if (pathIsGroup(lastVisited)) { Group group = (Group) node; if (pathIsGroupEnter(lastVisited) || pathIsGroupPassThrough(lastVisited)) { if (pathGroupHasNext(lastVisited)) { - cur = pathGroupGetNext(lastVisited); - curPath.add(pathToGroupEnter(pathIncGroupAltIndex(lastVisited))); if (pathIsGroupPassThrough(lastVisited)) { // a passthrough node was changed to an enter node, // so we register the loop in insideLoops registerInsideLoop(group); } - return true; + switchNextGroupAlternative(group); + cur = pathGroupGetNext(lastVisited); + return deduplicatePath(); } else { + if (pathIsGroupEnter(lastVisited)) { + popGroupEnter(group); + } else { + assert pathIsGroupPassThrough(lastVisited); + popGroupPassThrough(group); + } assert noEmptyGuardGroupEnterOnPath(group); if (pathIsGroupEnter(lastVisited)) { // we only deregister the node from insideLoops if this was an enter @@ -603,23 +571,27 @@ private boolean retreat() { (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { // In Ruby (and also in Python), when we finish an iteration of a loop, there is // an empty check. If we pass the empty check, we return to the beginning of the - // loop where we get to make a non-deterministic choice as to whether we want to - // start another iteration of the loop (so far the same as ECMAScript). However, - // if we fail // the empty check, we continue to the expression that follows the - // loop. 
We implement this by introducing two transitions, one leading to the - // start of the loop (empty check passes) and one escaping past the loop (empty - // check fails). The two transitions are then annotated with complementary - // guards (exitZeroWidth and escapeZeroWidth, respectively), so that at runtime, - // only one of the two transitions will be admissible. The clause below lets us + // loop where we get to make a non-deterministic choice whether we want to start + // another iteration of the loop (so far the same as ECMAScript). However, if we + // fail the empty check, we continue to the expression that follows the loop. We + // implement this by introducing two transitions, one leading to the start of + // the loop (empty check passes) and one escaping past the loop (empty check + // fails). The two transitions are then annotated with complementary guards + // (exitZeroWidth and escapeZeroWidth, respectively), so that at runtime, only + // one of the two transitions will be admissible. The clause below lets us // generate the second transition by replacing the loop exit with a loop escape. 
- curPath.add(pathToGroupEscape(lastVisited)); - if (advanceTerm(group)) { - return true; + switchExitToEscape(group); + return advanceTerm(group); + } else { + if (pathIsGroupExit(lastVisited)) { + popGroupExit(group); } else { - retreat(); + assert pathIsGroupEscape(lastVisited); + popGroupEscape(group); } } } else { + curPath.pop(); if (canTraverseLookArounds() && node.isLookAroundAssertion()) { if (node.isLookAheadAssertion()) { leaveLookAhead(node.asLookAheadAssertion()); @@ -627,6 +599,8 @@ private boolean retreat() { removeFromVisitedSet(lastVisited, lookAroundsOnPath); } else if (node.isDollar()) { removeFromVisitedSet(lastVisited, dollarsOnPath); + } else if (node.isCaret()) { + removeFromVisitedSet(lastVisited, caretsOnPath); } } } @@ -634,41 +608,32 @@ private boolean retreat() { return false; } - private void removeFromVisitedSet(long pathElement, StateSet visitedSet) { - if (--nodeVisitCount[pathGetNodeId(pathElement)] == 0) { - visitedSet.remove(pathGetNode(pathElement)); - } - } - - private boolean deduplicateTarget() { - if (!canTraverseLookArounds()) { - quantifierGuardsResult = null; - calcQuantifierGuards(); - if (quantifierGuardsResult.length != 0) { - // TODO: properly deduplicate transitions with guards - return false; - } - } - boolean isDuplicate = false; - if (dollarsOnPath.isEmpty() && lookAroundsOnPath.isEmpty()) { - isDuplicate = !targetsVisited.add(cur); - } else if (canTraverseLookArounds()) { - StateSet key = lookAroundsOnPath.copy(); - key.addAll(dollarsOnPath); - key.add(cur); - isDuplicate = targetDeduplicationMap.put(key, key) != null; + /** + * This should be called whenever {@link #cur} is set to some {@link Sequence}. + * + * @return {@code true} if a successor was found in this step + */ + private boolean deduplicatePath() { + // In regex flavors where backreferences to unmatched capture groups always pass (i.e. 
they + // behave as if the capture group matched the empty string), we don't have to distinguish + // two states of the traversal that differ only in capture groups, since the state that was + // encountered first will dominate the one found later and any empty capture groups that + // would have been matched along the way cannot affect future matching. + boolean captureGroupsMatter = ast.getOptions().getFlavor().backreferencesToUnmatchedGroupsFail(); + DeduplicationKey key; + if (captureGroupsMatter) { + key = CGSensitiveDeduplicationKey.create(cur, lookAroundsOnPath, dollarsOnPath, quantifierGuards, captureGroupUpdates, captureGroupClears, lastGroup); } else { - StateSet key = dollarsOnPath.copy(); - key.add(cur); - isDuplicate = targetDeduplicationMap.put(key, key) != null; + key = DeduplicationKey.create(cur, lookAroundsOnPath, dollarsOnPath, quantifierGuards); } + boolean isDuplicate = !pathDeduplicationSet.add(key); if (isDuplicate) { - if (++deduplicatedTargets > SUCCESSOR_DEDUPLICATION_BAILOUT_THRESHOLD) { - throw new UnsupportedRegexException("NFATraversal explosion"); - } return retreat(); + } else { + // We can return false, since this is only called when the current target node is a + // Sequence and these can never be successors. + return false; } - return false; } /** @@ -676,8 +641,7 @@ private boolean deduplicateTarget() { * of groups referenced in a group-enter path element.
* Since the same group can appear multiple times on the path, we cannot reuse {@link Group}'s * implementation of {@link RegexASTVisitorIterable}. Therefore, every occurrence of a group on - * the path has its own index for iterating and back-tracking over its alternatives.
- * This field's offset must be zero for {@link #pathIncGroupAltIndex(long)} to work! + * the path has its own index for iterating and back-tracking over its alternatives. */ private static final int PATH_GROUP_ALT_INDEX_OFFSET = 0; /** @@ -694,12 +658,10 @@ private boolean deduplicateTarget() { * */ private static final int PATH_GROUP_ACTION_OFFSET = Short.SIZE + Integer.SIZE; - private static final long PATH_GROUP_ACTION_CLEAR_MASK = 0x0000ffffffffffffL; private static final long PATH_GROUP_ACTION_ENTER = 1L << PATH_GROUP_ACTION_OFFSET; private static final long PATH_GROUP_ACTION_EXIT = 1L << PATH_GROUP_ACTION_OFFSET + 1; private static final long PATH_GROUP_ACTION_PASS_THROUGH = 1L << PATH_GROUP_ACTION_OFFSET + 2; private static final long PATH_GROUP_ACTION_ESCAPE = 1L << PATH_GROUP_ACTION_OFFSET + 3; - private static final long PATH_GROUP_ACTION_ENTER_OR_PASS_THROUGH = PATH_GROUP_ACTION_ENTER | PATH_GROUP_ACTION_PASS_THROUGH; private static final long PATH_GROUP_ACTION_ANY = PATH_GROUP_ACTION_ENTER | PATH_GROUP_ACTION_EXIT | PATH_GROUP_ACTION_PASS_THROUGH | PATH_GROUP_ACTION_ESCAPE; /** @@ -709,14 +671,6 @@ private static long createPathElement(RegexASTNode node) { return (long) node.getId() << PATH_NODE_OFFSET; } - /** - * Create a group-enter path element for the given Group. The group alternation index is - * initialized with 1! - */ - private static long createGroupEnterPathElement(Group node) { - return ((long) node.getId() << PATH_NODE_OFFSET) | (1 << PATH_GROUP_ALT_INDEX_OFFSET) | PATH_GROUP_ACTION_ENTER; - } - private static int pathGetNodeId(long pathElement) { return (int) (pathElement >>> PATH_NODE_OFFSET); } @@ -735,20 +689,6 @@ private static int pathGetGroupAltIndex(long pathElement) { return (short) (pathElement >>> PATH_GROUP_ALT_INDEX_OFFSET); } - /** - * Convert the given path element to a group-enter. 
- */ - private static long pathToGroupEnter(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_CLEAR_MASK) | PATH_GROUP_ACTION_ENTER; - } - - /** - * Convert the given path element to a group-escape. - */ - private static long pathToGroupEscape(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_CLEAR_MASK) | PATH_GROUP_ACTION_ESCAPE; - } - /** * Returns {@code true} if the given path element has any group action set. Every path element * containing a group must have one group action. @@ -773,14 +713,6 @@ private static boolean pathIsGroupEscape(long pathElement) { return (pathElement & PATH_GROUP_ACTION_ESCAPE) != 0; } - /** - * Convert a group enter path element to a group pass-through, and vice versa. - */ - private static long pathSwitchEnterAndPassThrough(long pathElement) { - assert (pathIsGroupEnter(pathElement) != pathIsGroupPassThrough(pathElement)); - return pathElement ^ PATH_GROUP_ACTION_ENTER_OR_PASS_THROUGH; - } - /** * Returns {@code true} if the path element's group alternation index is still in bounds. */ @@ -796,12 +728,9 @@ private Sequence pathGroupGetNext(long pathElement) { return ((Group) pathGetNode(pathElement)).getAlternatives().get(pathGetGroupAltIndex(pathElement)); } - /** - * Increment the group alternation index. This requires {@link #PATH_GROUP_ALT_INDEX_OFFSET} to - * be 0! 
- */ - private static long pathIncGroupAltIndex(long pathElement) { - return pathElement + 1; + private boolean isGroupExitOnPath(Group group) { + assert !curPath.isEmpty() && pathIsGroupEnter(curPath.peek()) && pathGetNode(curPath.peek()) == group; + return curPath.length() >= 2 && pathIsGroupExit(curPath.get(curPath.length() - 2)) && pathGetNode(curPath.get(curPath.length() - 2)) == group; } private boolean noEmptyGuardGroupEnterOnPath(Group group) { @@ -816,6 +745,295 @@ private boolean noEmptyGuardGroupEnterOnPath(Group group) { return true; } + /// Pushing and popping group elements to and from the path + private void pushGroupEnter(Group group, int groupAltIndex) { + curPath.add(createPathElement(group) | (groupAltIndex << PATH_GROUP_ALT_INDEX_OFFSET) | PATH_GROUP_ACTION_ENTER); + // Capture groups + if (group.isCapturing()) { + captureGroupUpdate(group.getBoundaryIndexStart()); + } + if (!ast.getOptions().getFlavor().nestedCaptureGroupsKeptOnLoopReentry() && group.hasQuantifier() && group.hasEnclosedCaptureGroups()) { + int lo = Group.groupNumberToBoundaryIndexStart(group.getEnclosedCaptureGroupsLow()); + int hi = Group.groupNumberToBoundaryIndexEnd(group.getEnclosedCaptureGroupsHigh() - 1); + captureGroupClear(lo, hi); + } + // Quantifier guards + if (useQuantifierGuards()) { + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasIndex()) { + if (quantifierGuardsLoop[quantifier.getIndex()] > 0 && quantifierGuardsExited[quantifier.getIndex()] == 0) { + pushQuantifierGuard(quantifier.isInfiniteLoop() ? 
QuantifierGuard.createLoopInc(quantifier) : QuantifierGuard.createLoop(quantifier)); + } else { + pushQuantifierGuard(QuantifierGuard.createEnter(quantifier)); + } + } + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + pushQuantifierGuard(QuantifierGuard.createEnterZeroWidth(quantifier)); + } + } + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + pushQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); + } + } + } + + private int popGroupEnter(Group group) { + assert pathIsGroupEnter(curPath.peek()); + // Quantifier guards + if (useQuantifierGuards()) { + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + popQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); + } + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + popQuantifierGuard(QuantifierGuard.createEnterZeroWidth(quantifier)); + } + if (quantifier.hasIndex()) { + if (quantifierGuardsLoop[quantifier.getIndex()] > 0 && quantifierGuardsExited[quantifier.getIndex()] == 0) { + popQuantifierGuard(quantifier.isInfiniteLoop() ? 
QuantifierGuard.createLoopInc(quantifier) : QuantifierGuard.createLoop(quantifier)); + } else { + popQuantifierGuard(QuantifierGuard.createEnter(quantifier)); + } + } + } + } + // Capture groups + if (!ast.getOptions().getFlavor().nestedCaptureGroupsKeptOnLoopReentry() && group.hasQuantifier() && group.hasEnclosedCaptureGroups()) { + popCaptureGroupEvent(); + } + if (group.isCapturing()) { + popCaptureGroupEvent(); + } + return pathGetGroupAltIndex(curPath.pop()); + } + + private void switchNextGroupAlternative(Group group) { + int groupAltIndex; + if (pathIsGroupEnter(curPath.peek())) { + groupAltIndex = popGroupEnter(group); + } else { + assert pathIsGroupPassThrough(curPath.peek()); + groupAltIndex = popGroupPassThrough(group); + } + pushGroupEnter(group, groupAltIndex + 1); + } + + private void pushGroupExit(Group group) { + curPath.add(createPathElement(group) | PATH_GROUP_ACTION_EXIT); + // Capture groups + if (group.isCapturing()) { + captureGroupUpdate(group.getBoundaryIndexEnd()); + if (ast.getOptions().getFlavor().usesLastGroupResultField() && group.getGroupNumber() != 0) { + lastGroupUpdate(group.getGroupNumber()); + } + } + // Quantifier guards + if (useQuantifierGuards()) { + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasIndex()) { + quantifierGuardsLoop[quantifier.getIndex()]++; + } + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + if (ast.getOptions().getFlavor().canHaveEmptyLoopIterations() || !root.isCharacterClass()) { + pushQuantifierGuard(QuantifierGuard.createExitZeroWidth(quantifier)); + } + } + } + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + pushQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + } + } + } + + private void popGroupExit(Group group) { + assert pathIsGroupExit(curPath.peek()); + // Quantifier guards 
+ if (useQuantifierGuards()) { + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + popQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + } + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + if (ast.getOptions().getFlavor().canHaveEmptyLoopIterations() || !root.isCharacterClass()) { + popQuantifierGuard(QuantifierGuard.createExitZeroWidth(quantifier)); + } + } + if (quantifier.hasIndex()) { + quantifierGuardsLoop[quantifier.getIndex()]--; + } + } + } + // Capture groups + if (group.isCapturing()) { + if (ast.getOptions().getFlavor().usesLastGroupResultField() && group.getGroupNumber() != 0) { + popCaptureGroupEvent(); + } + popCaptureGroupEvent(); + } + curPath.pop(); + } + + private void pushGroupPassThrough(Group group, int groupAltIndex) { + curPath.add(createPathElement(group) | PATH_GROUP_ACTION_PASS_THROUGH | (groupAltIndex << PATH_GROUP_ALT_INDEX_OFFSET)); + if (useQuantifierGuards()) { + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasIndex()) { + if (quantifier.getMin() > 0) { + quantifierGuardsExited[quantifier.getIndex()]++; + pushQuantifierGuard(QuantifierGuard.createExit(quantifier)); + } else { + pushQuantifierGuard(QuantifierGuard.createClear(quantifier)); + } + } + } + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + pushQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); + pushQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + } + } + } + + private int popGroupPassThrough(Group group) { + assert pathIsGroupPassThrough(curPath.peek()); + if (useQuantifierGuards()) { + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && 
group.isCapturing()) { + popQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + popQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexStart())); + } + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasIndex()) { + if (quantifier.getMin() > 0) { + popQuantifierGuard(QuantifierGuard.createExit(quantifier)); + quantifierGuardsExited[quantifier.getIndex()]--; + } else { + popQuantifierGuard(QuantifierGuard.createClear(quantifier)); + } + } + } + } + return pathGetGroupAltIndex(curPath.pop()); + } + + private void switchEnterToPassThrough(Group group) { + int groupAltIndex = popGroupEnter(group); + pushGroupPassThrough(group, groupAltIndex); + } + + private void switchExitToEscape(Group group) { + popGroupExit(group); + pushGroupEscape(group); + } + + private void pushGroupEscape(Group group) { + curPath.add(createPathElement(group) | PATH_GROUP_ACTION_ESCAPE); + // Quantifier guards + if (useQuantifierGuards()) { + if (group.hasQuantifier()) { + Quantifier quantifier = group.getQuantifier(); + if (quantifier.hasIndex()) { + quantifierGuardsExited[quantifier.getIndex()]++; + } + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + pushQuantifierGuard(QuantifierGuard.createEscapeZeroWidth(quantifier)); + } + } + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + pushQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + } + } + } + + private void popGroupEscape(Group group) { + assert pathIsGroupEscape(curPath.peek()); + // Quantifier guards + if (useQuantifierGuards()) { + if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && group.isCapturing()) { + popQuantifierGuard(QuantifierGuard.createUpdateCG(group.getBoundaryIndexEnd())); + } + if (group.hasQuantifier()) { + Quantifier quantifier = 
group.getQuantifier(); + if (quantifier.hasZeroWidthIndex() && (group.getFirstAlternative().isExpandedQuantifier() || group.getLastAlternative().isExpandedQuantifier())) { + popQuantifierGuard(QuantifierGuard.createEscapeZeroWidth(quantifier)); + } + if (quantifier.hasIndex()) { + quantifierGuardsExited[quantifier.getIndex()]--; + } + } + } + curPath.pop(); + } + + /// Capture group data handling + private void clearCaptureGroupData() { + captureGroupEvents.clear(); + captureGroupUpdates.clear(); + captureGroupClears.clear(); + lastGroup = -1; + } + + private void captureGroupUpdate(int boundary) { + captureGroupEvents.add(new CaptureGroupEvent.CaptureGroupUpdate(boundary, captureGroupUpdates.get(boundary), captureGroupClears.get(boundary))); + captureGroupUpdates.set(boundary); + captureGroupClears.clear(boundary); + } + + private void captureGroupClear(int low, int high) { + captureGroupEvents.add(new CaptureGroupEvent.CaptureGroupClears(captureGroupUpdates.copy(), captureGroupClears.copy())); + captureGroupClears.setRange(low, high); + captureGroupUpdates.clearRange(low, high); + } + + private void lastGroupUpdate(int newLastGroup) { + captureGroupEvents.add(new CaptureGroupEvent.LastGroupUpdate(lastGroup)); + lastGroup = newLastGroup; + } + + private void popCaptureGroupEvent() { + assert !captureGroupEvents.isEmpty(); + CaptureGroupEvent poppedEvent = captureGroupEvents.remove(captureGroupEvents.size() - 1); + poppedEvent.undo(this); + } + + /// Quantifier guard data handling + private boolean useQuantifierGuards() { + return !canTraverseLookArounds(); + } + + private void clearQuantifierGuards() { + quantifierGuards = null; + } + + private void pushQuantifierGuard(QuantifierGuard guard) { + assert useQuantifierGuards(); + quantifierGuards = new QuantifierGuardsLinkedList(guard, quantifierGuards); + } + + private void popQuantifierGuard(QuantifierGuard expectedGuard) { + assert useQuantifierGuards(); + assert quantifierGuards != null; + QuantifierGuard 
droppedGuard = quantifierGuards.getGuard(); + quantifierGuards = quantifierGuards.getPrev(); + assert droppedGuard.equals(expectedGuard); + } + + /// Visited set management + private void addToVisitedSet(StateSet visitedSet) { + nodeVisitCount[cur.getId()]++; + visitedSet.add(cur); + } + + private void removeFromVisitedSet(long pathElement, StateSet visitedSet) { + if (--nodeVisitCount[pathGetNodeId(pathElement)] == 0) { + visitedSet.remove(pathGetNode(pathElement)); + } + } + private boolean nodeVisitsEmpty() { for (int i : nodeVisitCount) { if (i != 0) { @@ -825,6 +1043,20 @@ private boolean nodeVisitsEmpty() { return true; } + /// insideLoops management + private void registerInsideLoop(Group group) { + insideLoops.put(group, insideLoops.get(group, 0) + 1); + } + + private void unregisterInsideLoop(Group group) { + int depth = insideLoops.get(group, 0); + if (depth == 1) { + insideLoops.removeKey(group); + } else if (depth > 1) { + insideLoops.put(group, depth - 1); + } + } + @SuppressWarnings("unused") private void dumpPath() { System.out.println("NEW PATH"); @@ -846,4 +1078,217 @@ private void dumpPath() { } } } + + private static class DeduplicationKey { + protected final StateSet nodesInvolved; + protected final QuantifierGuardsLinkedList quantifierGuards; + protected int hashCode; + + protected DeduplicationKey(RegexASTNode targetNode, StateSet lookAroundsOnPath, StateSet dollarsOnPath, + QuantifierGuardsLinkedList quantifierGuards) { + this.nodesInvolved = lookAroundsOnPath.copy(); + this.nodesInvolved.addAll(dollarsOnPath); + this.nodesInvolved.add(targetNode); + this.quantifierGuards = quantifierGuards; + } + + public static DeduplicationKey create(RegexASTNode targetNode, StateSet lookAroundsOnPath, StateSet dollarsOnPath, + QuantifierGuardsLinkedList quantifierGuards) { + DeduplicationKey key = new DeduplicationKey(targetNode, lookAroundsOnPath, dollarsOnPath, quantifierGuards); + key.hashCode = key.calculateHashCode(); + return key; + } + + @Override 
+ public boolean equals(Object obj) { + if (obj == null || (getClass() != obj.getClass())) { + return false; + } + DeduplicationKey other = (DeduplicationKey) obj; + return this.nodesInvolved.equals(other.nodesInvolved) && Objects.equals(this.quantifierGuards, other.quantifierGuards); + } + + protected int calculateHashCode() { + return Objects.hash(nodesInvolved, quantifierGuards); + } + + @Override + public int hashCode() { + return hashCode; + } + } + + private static final class CGSensitiveDeduplicationKey extends DeduplicationKey { + private final TBitSet captureGroupUpdates; + private final TBitSet captureGroupClears; + private final int lastGroup; + + protected CGSensitiveDeduplicationKey(RegexASTNode targetNode, StateSet lookAroundsOnPath, StateSet dollarsOnPath, + QuantifierGuardsLinkedList quantifierGuards, TBitSet captureGroupUpdates, TBitSet captureGroupClears, int lastGroup) { + super(targetNode, lookAroundsOnPath, dollarsOnPath, quantifierGuards); + this.captureGroupUpdates = captureGroupUpdates.copy(); + this.captureGroupClears = captureGroupClears.copy(); + this.lastGroup = lastGroup; + } + + public static CGSensitiveDeduplicationKey create(RegexASTNode targetNode, StateSet lookAroundsOnPath, StateSet dollarsOnPath, + QuantifierGuardsLinkedList quantifierGuards, TBitSet captureGroupUpdates, TBitSet captureGroupClears, int lastGroup) { + CGSensitiveDeduplicationKey key = new CGSensitiveDeduplicationKey(targetNode, lookAroundsOnPath, dollarsOnPath, quantifierGuards, captureGroupUpdates, captureGroupClears, lastGroup); + key.hashCode = key.calculateHashCode(); + return key; + } + + @Override + public boolean equals(Object obj) { + if (obj == null || (getClass() != obj.getClass())) { + return false; + } + CGSensitiveDeduplicationKey other = (CGSensitiveDeduplicationKey) obj; + return this.nodesInvolved.equals(other.nodesInvolved) && Objects.equals(this.quantifierGuards, other.quantifierGuards) && Objects.equals(captureGroupUpdates, 
other.captureGroupUpdates) && + Objects.equals(captureGroupClears, other.captureGroupClears) && this.lastGroup == other.lastGroup; + } + + @Override + protected int calculateHashCode() { + return Objects.hash(super.calculateHashCode(), captureGroupUpdates, captureGroupClears, lastGroup); + } + + @Override + public int hashCode() { + return hashCode; + } + } + + private static final class QuantifierGuardsLinkedList { + private final QuantifierGuard guard; + private final QuantifierGuardsLinkedList prev; + private final int length; + private final int hashCode; + + QuantifierGuardsLinkedList(QuantifierGuard guard, QuantifierGuardsLinkedList prev) { + this.guard = guard; + this.prev = prev; + this.length = prev == null ? 1 : prev.length + 1; + this.hashCode = guard.hashCode() + 31 * (prev == null ? 0 : prev.hashCode); + } + + public QuantifierGuardsLinkedList getPrev() { + return prev; + } + + public QuantifierGuard getGuard() { + return guard; + } + + public int getLength() { + return length; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof QuantifierGuardsLinkedList)) { + return false; + } + QuantifierGuardsLinkedList other = (QuantifierGuardsLinkedList) obj; + return this.hashCode == other.hashCode && this.length == other.length && this.guard.equals(other.guard) && (prev == null || prev.equals(other.prev)); + } + + @Override + public int hashCode() { + return hashCode; + } + + public QuantifierGuard[] toArray() { + QuantifierGuard[] result = new QuantifierGuard[getLength()]; + QuantifierGuardsLinkedList cur = this; + for (int i = result.length - 1; i >= 0; i--) { + result[i] = cur.getGuard(); + cur = cur.getPrev(); + } + return result; + } + + public QuantifierGuard[] toArray(Predicate filter) { + int resultSize = 0; + QuantifierGuardsLinkedList cur = this; + while (cur != null) { + if (filter.test(cur.getGuard())) { + resultSize++; + } + cur = cur.getPrev(); + } + QuantifierGuard[] result = new QuantifierGuard[resultSize]; + cur = 
this; + while (cur != null) { + if (filter.test(cur.getGuard())) { + result[--resultSize] = cur.getGuard(); + } + cur = cur.getPrev(); + } + return result; + } + } + + private abstract static class CaptureGroupEvent { + + public abstract void undo(NFATraversalRegexASTVisitor visitor); + + private static final class CaptureGroupUpdate extends CaptureGroupEvent { + + private final int boundary; + private final boolean prevUpdate; + private final boolean prevClear; + + CaptureGroupUpdate(int boundary, boolean prevUpdate, boolean prevClear) { + this.boundary = boundary; + this.prevUpdate = prevUpdate; + this.prevClear = prevClear; + } + + @Override + public void undo(NFATraversalRegexASTVisitor visitor) { + if (prevUpdate) { + visitor.captureGroupUpdates.set(boundary); + } else { + visitor.captureGroupUpdates.clear(boundary); + } + if (prevClear) { + visitor.captureGroupClears.set(boundary); + } else { + visitor.captureGroupClears.clear(boundary); + } + } + } + + private static final class CaptureGroupClears extends CaptureGroupEvent { + + private final TBitSet prevUpdates; + private final TBitSet prevClears; + + CaptureGroupClears(TBitSet prevUpdates, TBitSet prevClears) { + this.prevUpdates = prevUpdates; + this.prevClears = prevClears; + } + + @Override + public void undo(NFATraversalRegexASTVisitor visitor) { + visitor.captureGroupUpdates = prevUpdates; + visitor.captureGroupClears = prevClears; + } + } + + private static final class LastGroupUpdate extends CaptureGroupEvent { + + private final int prevLastGroup; + + LastGroupUpdate(int prevLastGroup) { + this.prevLastGroup = prevLastGroup; + } + + @Override + public void undo(NFATraversalRegexASTVisitor visitor) { + visitor.lastGroup = prevLastGroup; + } + } + } }