Skip to content

Commit

Permalink
Automatically anchor regexes beginning with .* (dotnet#1706)
Browse files Browse the repository at this point in the history
  • Loading branch information
stephentoub authored Jan 17, 2020
1 parent 2749e0c commit 4c5e56b
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -174,63 +174,127 @@ private void MakeRep(int type, int min, int max)
}

/// <summary>Performs additional optimizations on an entire tree prior to being used.</summary>
/// <remarks>
/// Some optimizations are performed by the parser while parsing, and others are performed
/// as nodes are being added to the tree. The optimizations here expect the tree to be fully
/// formed, as they inspect relationships between nodes that may not have been in place as
/// individual nodes were being processed/added to the tree.
/// </remarks>
internal RegexNode FinalOptimize()
{
RegexNode rootNode = this;
Debug.Assert(rootNode.Type == Capture && rootNode.ChildCount() == 1);
Debug.Assert(rootNode.Type == Capture);
Debug.Assert(rootNode.Next is null);
Debug.Assert(rootNode.ChildCount() == 1);

// If we find a backtracking construct at the end of the regex, we can instead make it non-backtracking,
// since nothing would ever backtrack into it anyway. Doing this then makes the construct available
// to implementations that don't support backtracking.
if ((Options & RegexOptions.RightToLeft) == 0 && // only apply optimization when LTR to avoid needing additional code for the rarer RTL case
(Options & RegexOptions.Compiled) != 0) // only apply when we're compiling, as that's the only time it would make a meaningful difference
if ((Options & RegexOptions.RightToLeft) == 0) // only apply optimization when LTR to avoid needing additional code for the rarer RTL case
{
// Walk the tree, starting from the sole child of the root implicit capture.
RegexNode node = rootNode.Child(0);
while (true)
// Optimization: backtracking removal at expression end.
// If we find a backtracking construct at the end of the regex, we can instead make it non-backtracking,
// since nothing would ever backtrack into it anyway. Doing this then makes the construct available
// to implementations that don't support backtracking.
if ((Options & RegexOptions.Compiled) != 0) // only apply when we're compiling, as that's the only time it would make a meaningful difference
{
switch (node.Type)
// Walk the tree, starting from the sole child of the root implicit capture.
RegexNode node = rootNode.Child(0);
while (true)
{
case Oneloop:
node.Type = Oneloopatomic;
break;
switch (node.Type)
{
case Oneloop:
node.Type = Oneloopatomic;
break;

case Notoneloop:
node.Type = Notoneloopatomic;
break;
case Notoneloop:
node.Type = Notoneloopatomic;
break;

case Setloop:
node.Type = Setloopatomic;
break;
case Setloop:
node.Type = Setloopatomic;
break;

case Capture:
case Concatenate:
RegexNode existingChild = node.Child(node.ChildCount() - 1);
switch (existingChild.Type)
{
default:
node = existingChild;
break;

case Alternate:
case Loop:
case Lazyloop:
var atomic = new RegexNode(Atomic, Options);
atomic.AddChild(existingChild);
node.ReplaceChild(node.ChildCount() - 1, atomic);
break;
}
continue;
case Capture:
case Concatenate:
RegexNode existingChild = node.Child(node.ChildCount() - 1);
switch (existingChild.Type)
{
default:
node = existingChild;
break;

case Alternate:
case Loop:
case Lazyloop:
var atomic = new RegexNode(Atomic, Options);
atomic.AddChild(existingChild);
node.ReplaceChild(node.ChildCount() - 1, atomic);
break;
}
continue;

case Atomic:
node = node.Child(0);
continue;
case Atomic:
node = node.Child(0);
continue;
}

break;
}
}

break;
// Optimization: implicit anchoring.
// If the expression begins with a .* loop, add an anchor to the beginning:
// - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through
// the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all
// again, and we can anchor to beginning.
// - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid
// re-examining that content and anchor to the beginning of lines.
// We are currently very conservative here, only examining concat nodes. This could be loosened in the future, e.g. to
// explore captures (but think through any implications of there being a back ref to that capture), to explore loops and
// lazy loops with a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding
// an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the
// loops to be before the alternation), etc.
{
RegexNode node = rootNode.Child(0); // skip implicit root capture node
while (true)
{
bool singleline = (node.Options & RegexOptions.Singleline) != 0;
switch (node.Type)
{
case Concatenate:
node = node.Child(0);
continue;

case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n':
case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n':
RegexNode? parent = node.Next;
var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options);
Debug.Assert(parent != null);
if (parent.Type == Concatenate)
{
Debug.Assert(parent.ChildCount() >= 2);
Debug.Assert(parent.Children is List<RegexNode>);
anchor.Next = parent;
((List<RegexNode>)parent.Children).Insert(0, anchor);
}
else
{
Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture");
var concat = new RegexNode(Concatenate, parent.Options);
concat.AddChild(anchor);
concat.AddChild(node);
parent.ReplaceChild(0, concat);
}
break;
}

break;
}
}
}

// Optimization: Unnecessary root atomic.
// If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
// to backtrack into it, so we can remove it.
if (rootNode.Child(0).Type == Atomic)
Expand Down Expand Up @@ -1220,5 +1284,5 @@ public void Dump()
}
}
#endif
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,49 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } };
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } };

// Anchoring loops beginning with .* / .+
yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } };
yield return new object[] { null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } };
yield return new object[] { null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } };
yield return new object[] { null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } };
yield return new object[] { null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" } };
yield return new object[] { null, @".*", "\n", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } };
yield return new object[] { null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".*", "abc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } };
yield return new object[] { null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } };
yield return new object[] { null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } };
yield return new object[] { null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
yield return new object[] { null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } };
yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } };
yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } };
yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } };
yield return new object[] { null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } };
yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } };
yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } };
yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } };
yield return new object[] { null, @".+", "a", RegexOptions.None, new string[] { "a" } };
yield return new object[] { null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } };
yield return new object[] { null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } };
yield return new object[] { null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
yield return new object[] { null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };

// Grouping Constructs Invalid Regular Expressions
yield return new object[] { null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
yield return new object[] { null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
Expand Down

0 comments on commit 4c5e56b

Please sign in to comment.