diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 469d9f6d3d2cf..3d7bcdc5f1cfc 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -2,7 +2,6 @@
System.Text.RegularExpressions
$(DefineConstants);FEATURE_COMPILED
- $(DefineConstants);FEATURE_COMPILEAPIS
true
$(NetCoreAppCurrent)-Debug;$(NetCoreAppCurrent)-Release
enable
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
index 257300e6e3f52..e39d80f3a68e1 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
@@ -2,24 +2,32 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
-// This is the only concrete implementation of RegexRunnerFactory,
-// but we cannot combine them due to RegexRunnerFactory having shipped public.
+using System.Reflection.Emit;
namespace System.Text.RegularExpressions
{
+ /// <summary>Factory that hands out <see cref="CompiledRegexRunner"/> instances backed by three dynamic methods supplied at construction.</summary>
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
{
- private readonly Action<RegexRunner> _go;
- private readonly Func<RegexRunner, bool> _findFirstChar;
- private readonly Action<RegexRunner> _initTrackCount;
+ private readonly DynamicMethod _goMethod;
+ private readonly DynamicMethod _findFirstCharMethod;
+ private readonly DynamicMethod _initTrackCountMethod;
- public CompiledRegexRunnerFactory(Action<RegexRunner> go, Func<RegexRunner, bool> findFirstChar, Action<RegexRunner> initTrackCount)
+ // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed.
+ // NOTE(review): the generic arguments on these delegate types were missing from this listing and are
+ // reconstructed; confirm them against the CompiledRegexRunner constructor.
+ private Action<RegexRunner>? _go;
+ private Func<RegexRunner, bool>? _findFirstChar;
+ private Action<RegexRunner>? _initTrackCount;
+
+ /// <summary>Stores the three dynamic methods; no delegate is created here.</summary>
+ public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, DynamicMethod initTrackCountMethod)
{
- _go = go;
- _findFirstChar = findFirstChar;
- _initTrackCount = initTrackCount;
+ _goMethod = goMethod;
+ _findFirstCharMethod = findFirstCharMethod;
+ _initTrackCountMethod = initTrackCountMethod;
}
- protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner(_go, _findFirstChar, _initTrackCount);
+ /// <summary>
+ /// Creates a runner. Each delegate is created from its dynamic method the first time it is needed
+ /// and the cached delegate is reused on later calls.
+ /// </summary>
+ protected internal override RegexRunner CreateInstance() =>
+ new CompiledRegexRunner(
+ _go ??= (Action<RegexRunner>)_goMethod.CreateDelegate(typeof(Action<RegexRunner>)),
+ _findFirstChar ??= (Func<RegexRunner, bool>)_findFirstCharMethod.CreateDelegate(typeof(Func<RegexRunner, bool>)),
+ _initTrackCount ??= (Action<RegexRunner>)_initTrackCountMethod.CreateDelegate(typeof(Action<RegexRunner>)));
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
index a29209beced54..a16fd0ec90a17 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
@@ -26,6 +26,7 @@
//
using System.Collections;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -331,6 +332,7 @@ internal void Tidy(int textpos)
}
#if DEBUG
+ [ExcludeFromCodeCoverage]
internal bool Debug => _regex != null && _regex.Debug;
internal virtual void Dump()
@@ -372,6 +374,7 @@ internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int
public override GroupCollection Groups => _groupcoll ??= new GroupCollection(this, _caps);
#if DEBUG
+ [ExcludeFromCodeCoverage]
internal override void Dump()
{
if (_caps != null)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
index 13a741dfddbd1..cc505492c164a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -207,7 +207,6 @@ private static RegexRunnerFactory Compile(RegexCode code, RegexOptions options,
}
#endif
-#if FEATURE_COMPILEAPIS
public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, AssemblyName assemblyname)
{
throw new PlatformNotSupportedException(SR.PlatformNotSupported_CompileToAssembly);
@@ -222,7 +221,6 @@ public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, Assembly
{
throw new PlatformNotSupportedException(SR.PlatformNotSupported_CompileToAssembly);
}
-#endif // FEATURE_COMPILEAPIS
///
/// Escapes a minimal set of metacharacters (\, *, +, ?, |, {, [, (, ), ^, $, ., #, and
@@ -461,6 +459,7 @@ protected void InitializeReferences()
/// <summary>
/// True if the regex has debugging enabled, i.e. the RegexOptions.Debug bit is set in roptions.
/// </summary>
+ [ExcludeFromCodeCoverage]
internal bool Debug => (roptions & RegexOptions.Debug) != 0;
#endif
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
index c142f5d5a6274..12b7c24cf8210 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
@@ -12,6 +12,7 @@
// need to be examined.
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -338,8 +339,10 @@ public int Scan(string text, int index, int beglimit, int endlimit)
/// <summary>
/// Used when dumping for debugging. Returns the value of Pattern.
/// </summary>
+ [ExcludeFromCodeCoverage]
public override string ToString() => Pattern;
+ [ExcludeFromCodeCoverage]
public string Dump(string indent)
{
StringBuilder sb = new StringBuilder();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 2c66ad68366f8..d85ad3cb1fb21 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Threading;
@@ -733,7 +734,7 @@ public static string ConvertOldStringsToClass(string set, string category)
///
public static char SingletonChar(string set)
{
- Debug.Assert(IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
+ Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
return set[SetStartIndex];
}
@@ -748,6 +749,20 @@ public static bool IsEmpty(string charClass) =>
!IsNegated(charClass) &&
!IsSubtraction(charClass);
+ /// <summary><c>true</c> if the set contains a single character only.</summary>
+ /// <remarks>
+ /// This will happen not only from character classes manually written to contain a single character,
+ /// but much more frequently by the implementation/parser itself, e.g. when looking for \n as part of
+ /// finding the end of a line, when processing an alternation like "hello|hithere" where the first
+ /// character of both options is the same, etc.
+ /// </remarks>
+ public static bool IsSingleton(string set) =>
+ set[CategoryLengthIndex] == 0 &&
+ set[SetLengthIndex] == 2 &&
+ !IsNegated(set) &&
+ !IsSubtraction(set) &&
+ (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
+
public static bool IsSingletonInverse(string set) =>
set[CategoryLengthIndex] == 0 &&
set[SetLengthIndex] == 2 &&
@@ -755,6 +770,117 @@ public static bool IsSingletonInverse(string set) =>
!IsSubtraction(set) &&
(set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
+ /// <summary>Gets all of the characters in the specified set, storing them into the provided span.</summary>
+ /// <param name="set">The character class.</param>
+ /// <param name="chars">The span into which the chars should be stored.</param>
+ /// <returns>
+ /// The number of stored chars. If they won't all fit, 0 is returned.
+ /// </returns>
+ /// <remarks>
+ /// Only considers character classes that only contain sets (no categories), no negation,
+ /// and no subtraction... just simple sets containing starting/ending pairs.
+ /// </remarks>
+ public static int GetSetChars(string set, Span<char> chars)
+ {
+ if (!CanEasilyEnumerateSetContents(set))
+ {
+ return 0;
+ }
+
+ int setLength = set[SetLengthIndex];
+ int count = 0;
+ for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2)
+ {
+ // Each pair is (first char, end char); the end char itself is not part of the range.
+ int curSetEnd = set[i + 1];
+ for (int c = set[i]; c < curSetEnd; c++)
+ {
+ // Out of room: report 0 rather than a partial result.
+ if (count >= chars.Length)
+ {
+ return 0;
+ }
+
+ chars[count++] = (char)c;
+ }
+ }
+
+ return count;
+ }
+
+ /// <summary>
+ /// Determines whether two sets may overlap.
+ /// </summary>
+ /// <returns>false if the two sets do not overlap; true if they may.</returns>
+ /// <remarks>
+ /// If the method returns false, the caller can be sure the sets do not overlap.
+ /// If the method returns true, it's still possible the sets don't overlap.
+ /// </remarks>
+ public static bool MayOverlap(string set1, string set2)
+ {
+ // If either set is all-inclusive, there's overlap.
+ if (set1 == AnyClass || set2 == AnyClass)
+ {
+ return true;
+ }
+
+ // If the sets are identical other than one being the negation of the other, they don't overlap.
+ if (IsNegated(set1) != IsNegated(set2) && set1.AsSpan(1).SequenceEqual(set2.AsSpan(1)))
+ {
+ return false;
+ }
+
+ // Special-case some known, common classes that don't overlap.
+ if (KnownDistinctSets(set1, set2) ||
+ KnownDistinctSets(set2, set1))
+ {
+ return false;
+ }
+
+ // If set2 can be easily enumerated (e.g. no unicode categories), then enumerate it and
+ // check if any of its members are in set1. Otherwise, the same for set1.
+ if (CanEasilyEnumerateSetContents(set2))
+ {
+ return MayOverlapByEnumeration(set1, set2);
+ }
+ else if (CanEasilyEnumerateSetContents(set1))
+ {
+ return MayOverlapByEnumeration(set2, set1);
+ }
+
+ // Assume that everything else might overlap. In the future, if it proves impactful, we could be more accurate here,
+ // at the expense of more computation time.
+ return true;
+
+ static bool KnownDistinctSets(string set1, string set2) =>
+ (set1 == SpaceClass || set1 == ECMASpaceClass) &&
+ (set2 == DigitClass || set2 == WordClass || set2 == ECMADigitClass || set2 == ECMAWordClass);
+
+ static bool MayOverlapByEnumeration(string set1, string set2)
+ {
+ for (int i = SetStartIndex; i < SetStartIndex + set2[SetLengthIndex]; i += 2)
+ {
+ int curSetEnd = set2[i + 1];
+ for (int c = set2[i]; c < curSetEnd; c++)
+ {
+ if (CharInClass((char)c, set1))
+ {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
+ internal static bool CanEasilyEnumerateSetContents(string set) =>
+ set.Length > SetStartIndex &&
+ set[SetLengthIndex] > 0 &&
+ set[SetLengthIndex] % 2 == 0 &&
+ set[CategoryLengthIndex] == 0 &&
+ !IsNegated(set) &&
+ !IsSubtraction(set);
+
internal static bool IsSubtraction(string charClass) =>
charClass.Length > SetStartIndex +
charClass[CategoryLengthIndex] +
@@ -1249,6 +1375,7 @@ private static ReadOnlySpan SetFromProperty(string capname, bool invert, s
///
/// Produces a human-readable description for a set string.
///
+ [ExcludeFromCodeCoverage]
public static string SetDescription(string set)
{
int setLength = set[SetLengthIndex];
@@ -1347,6 +1474,7 @@ public static string SetDescription(string set)
///
/// Produces a human-readable description for a single character.
///
+ [ExcludeFromCodeCoverage]
public static string CharDescription(char ch)
{
if (ch == '\\')
@@ -1382,6 +1510,7 @@ public static string CharDescription(char ch)
return sb.ToString();
}
+ [ExcludeFromCodeCoverage]
private static string CategoryDescription(char ch)
{
if (ch == SpaceConst)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
index 33ef2f0772815..8c3da31d7c743 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
@@ -18,6 +18,7 @@
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
namespace System.Text.RegularExpressions
{
@@ -75,12 +76,18 @@ internal sealed class RegexCode
public const int Testref = 37; // backtrack if ref undefined
public const int Goto = 38; // jump just go
- public const int Prune = 39; // prune it baby
public const int Stop = 40; // done!
public const int ECMABoundary = 41; // \b
public const int NonECMABoundary = 42; // \B
+ // Manufactured primitive operations, derived from the tree that comes from the parser.
+ // These exist to reduce backtracking (both actually performing it and emitting code for it).
+
+ public const int Oneloopatomic = 43; // lef,back char,min,max (?> a {,n} )
+ public const int Notoneloopatomic = 44; // lef,back char,min,max (?> . {,n} )
+ public const int Setloopatomic = 45; // lef,back set,min,max (?> [\d]{,n} )
+
// Modifiers for alternate modes
public const int Mask = 63; // Mask to get unmodified ordinary operator
public const int Rtl = 64; // bit to indicate that we're reverse scanning.
@@ -88,6 +95,7 @@ internal sealed class RegexCode
public const int Back2 = 256; // bit to indicate that we're backtracking on a second branch.
public const int Ci = 512; // bit to indicate that we're case-insensitive.
+ public readonly RegexTree Tree; // the optimized parse tree
public readonly int[] Codes; // the code
public readonly string[] Strings; // the string/set table
public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings
@@ -100,17 +108,15 @@ internal sealed class RegexCode
public readonly int Anchors; // the set of zero-length start anchors (RegexFCD.Bol, etc)
public readonly bool RightToLeft; // true if right to left
- public RegexCode(int[] codes, List stringlist, int trackcount,
- Hashtable? caps, int capsize,
- RegexBoyerMoore? bmPrefix, RegexPrefix? fcPrefix,
- int anchors, bool rightToLeft)
+ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
+ Hashtable? caps, int capsize,
+ RegexBoyerMoore? bmPrefix, RegexPrefix? fcPrefix,
+ int anchors, bool rightToLeft)
{
- Debug.Assert(codes != null, "codes cannot be null.");
- Debug.Assert(stringlist != null, "stringlist cannot be null.");
-
+ Tree = tree;
Codes = codes;
- Strings = stringlist.ToArray();
- StringsAsciiLookup = new int[Strings.Length][];
+ Strings = strings;
+ StringsAsciiLookup = new int[strings.Length][];
TrackCount = trackcount;
Caps = caps;
CapSize = capsize;
@@ -190,7 +196,6 @@ public static int OpcodeSize(int opcode)
case Lazybranch:
case Branchmark:
case Lazybranchmark:
- case Prune:
case Set:
return 2;
@@ -200,12 +205,15 @@ public static int OpcodeSize(int opcode)
case Onerep:
case Notonerep:
case Oneloop:
+ case Oneloopatomic:
case Notoneloop:
+ case Notoneloopatomic:
case Onelazy:
case Notonelazy:
case Setlazy:
case Setrep:
case Setloop:
+ case Setloopatomic:
return 3;
default:
@@ -214,36 +222,68 @@ public static int OpcodeSize(int opcode)
}
#if DEBUG
- private static readonly string[] s_codeStr = new string[]
- {
- "Onerep", "Notonerep", "Setrep",
- "Oneloop", "Notoneloop", "Setloop",
- "Onelazy", "Notonelazy", "Setlazy",
- "One", "Notone", "Set",
- "Multi", "Ref",
- "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
- "Nothing",
- "Lazybranch", "Branchmark", "Lazybranchmark",
- "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
- "Nullmark", "Setmark", "Capturemark", "Getmark",
- "Setjump", "Backjump", "Forejump", "Testref", "Goto",
- "Prune", "Stop",
-#if ECMA
- "ECMABoundary", "NonECMABoundary",
-#endif
- };
-
+ /// <summary>
+ /// Builds a readable name for an opcode: the name of the base operator (the bits selected by Mask,
+ /// or "(unknown)" if none matches) followed by a suffix for each of the Ci, Rtl, Back and Back2 bits that is set.
+ /// </summary>
+ [ExcludeFromCodeCoverage]
private static string OperatorDescription(int Opcode)
{
- bool isCi = ((Opcode & Ci) != 0);
- bool isRtl = ((Opcode & Rtl) != 0);
- bool isBack = ((Opcode & Back) != 0);
- bool isBack2 = ((Opcode & Back2) != 0);
-
- return s_codeStr[Opcode & Mask] +
- (isCi ? "-Ci" : "") + (isRtl ? "-Rtl" : "") + (isBack ? "-Back" : "") + (isBack2 ? "-Back2" : "");
+ string codeStr = (Opcode & Mask) switch
+ {
+ Onerep => nameof(Onerep),
+ Notonerep => nameof(Notonerep),
+ Setrep => nameof(Setrep),
+ Oneloop => nameof(Oneloop),
+ Notoneloop => nameof(Notoneloop),
+ Setloop => nameof(Setloop),
+ Onelazy => nameof(Onelazy),
+ Notonelazy => nameof(Notonelazy),
+ Setlazy => nameof(Setlazy),
+ One => nameof(One),
+ Notone => nameof(Notone),
+ Set => nameof(Set),
+ Multi => nameof(Multi),
+ Ref => nameof(Ref),
+ Bol => nameof(Bol),
+ Eol => nameof(Eol),
+ Boundary => nameof(Boundary),
+ Nonboundary => nameof(Nonboundary),
+ Beginning => nameof(Beginning),
+ Start => nameof(Start),
+ EndZ => nameof(EndZ),
+ End => nameof(End),
+ Nothing => nameof(Nothing),
+ Lazybranch => nameof(Lazybranch),
+ Branchmark => nameof(Branchmark),
+ Lazybranchmark => nameof(Lazybranchmark),
+ Nullcount => nameof(Nullcount),
+ Setcount => nameof(Setcount),
+ Branchcount => nameof(Branchcount),
+ Lazybranchcount => nameof(Lazybranchcount),
+ Nullmark => nameof(Nullmark),
+ Setmark => nameof(Setmark),
+ Capturemark => nameof(Capturemark),
+ Getmark => nameof(Getmark),
+ Setjump => nameof(Setjump),
+ Backjump => nameof(Backjump),
+ Forejump => nameof(Forejump),
+ Testref => nameof(Testref),
+ Goto => nameof(Goto),
+ Stop => nameof(Stop),
+ ECMABoundary => nameof(ECMABoundary),
+ NonECMABoundary => nameof(NonECMABoundary),
+ Oneloopatomic => nameof(Oneloopatomic),
+ Notoneloopatomic => nameof(Notoneloopatomic),
+ Setloopatomic => nameof(Setloopatomic),
+ _ => "(unknown)"
+ };
+
+ return
+ codeStr +
+ ((Opcode & Ci) != 0 ? "-Ci" : "") +
+ ((Opcode & Rtl) != 0 ? "-Rtl" : "") +
+ ((Opcode & Back) != 0 ? "-Back" : "") +
+ ((Opcode & Back2) != 0 ? "-Back2" : "");
}
+ [ExcludeFromCodeCoverage]
public string OpcodeDescription(int offset)
{
StringBuilder sb = new StringBuilder();
@@ -263,7 +303,9 @@ public string OpcodeDescription(int offset)
case Onerep:
case Notonerep:
case Oneloop:
+ case Oneloopatomic:
case Notoneloop:
+ case Notoneloopatomic:
case Onelazy:
case Notonelazy:
sb.Append("Ch = ");
@@ -273,6 +315,7 @@ public string OpcodeDescription(int offset)
case Set:
case Setrep:
case Setloop:
+ case Setloopatomic:
case Setlazy:
sb.Append("Set = ");
sb.Append(RegexCharClass.SetDescription(Strings[Codes[offset + 1]]));
@@ -321,11 +364,14 @@ public string OpcodeDescription(int offset)
case Onerep:
case Notonerep:
case Oneloop:
+ case Oneloopatomic:
case Notoneloop:
+ case Notoneloopatomic:
case Onelazy:
case Notonelazy:
case Setrep:
case Setloop:
+ case Setloopatomic:
case Setlazy:
sb.Append(", Rep = ");
if (Codes[offset + 2] == int.MaxValue)
@@ -349,6 +395,7 @@ public string OpcodeDescription(int offset)
return sb.ToString();
}
+ [ExcludeFromCodeCoverage]
public void Dump()
{
int i;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index e97c25a37a8db..222c68f6c57d6 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -2,7 +2,9 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Reflection;
using System.Reflection.Emit;
@@ -26,7 +28,8 @@ internal abstract class RegexCompiler
private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack");
private static readonly FieldInfo s_runtrackcountField = RegexRunnerField("runtrackcount");
- private static readonly MethodInfo s_ensureStorageMethod = RegexRunnerMethod("EnsureStorage");
+ private static readonly MethodInfo s_doubleStackMethod = RegexRunnerMethod("DoubleStack");
+ private static readonly MethodInfo s_doubleTrackMethod = RegexRunnerMethod("DoubleTrack");
private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture");
private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture");
private static readonly MethodInfo s_uncaptureMethod = RegexRunnerMethod("Uncapture");
@@ -48,14 +51,22 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
+ private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!;
+ private static readonly MethodInfo s_spanIndexOf = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Item", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
+ private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
+ private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
+#if DEBUG
+ private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!;
+#endif
protected ILGenerator? _ilg;
// tokens representing local variables
- private LocalBuilder? _runtextstartLocal;
private LocalBuilder? _runtextbegLocal;
private LocalBuilder? _runtextendLocal;
private LocalBuilder? _runtextposLocal;
@@ -70,7 +81,8 @@ internal abstract class RegexCompiler
private LocalBuilder? _cultureLocal; // current culture is cached in local variable to prevent many thread local storage accesses for CultureInfo.CurrentCulture
private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop
- protected RegexCode? _code; // the RegexCode object (used for debugging only)
+ protected RegexOptions _options; // options
+ protected RegexCode? _code; // the RegexCode object
protected int[]? _codes; // the RegexCodes being translated
protected string[]? _strings; // the stringtable associated with the RegexCodes
protected RegexPrefix? _fcPrefix; // the possible first chars computed by RegexFCD
@@ -82,16 +94,12 @@ internal abstract class RegexCompiler
private BacktrackNote[]? _notes; // a list of the backtracking states to be generated
private int _notecount; // true count of _notes (allocation grows exponentially)
protected int _trackcount; // count of backtracking states (used to reduce allocations)
-
private Label _backtrack; // label for backtracking
-
private int _regexopcode; // the current opcode being processed
private int _codepos; // the current code being translated
private int _backpos; // the current backtrack-note being translated
- protected RegexOptions _options; // options
-
// special code fragments
private int[]? _uniquenote; // _notes indices for code that should be emitted <= once
private int[]? _goto; // indices for forward-jumps-through-switch (for allocations)
@@ -289,6 +297,9 @@ private void Ldc(int i)
/// A macro for _ilg.Emit(OpCodes.Sub) or _ilg.Emit(OpCodes.Add).
private void Sub(bool negate) => _ilg!.Emit(negate ? OpCodes.Add : OpCodes.Sub);
+ /// <summary>A macro for _ilg.Emit(OpCodes.Mul).</summary>
+ private void Mul() => _ilg!.Emit(OpCodes.Mul);
+
/// A macro for _ilg.Emit(OpCodes.And).
private void And() => _ilg!.Emit(OpCodes.And);
@@ -307,7 +318,7 @@ private void Ldc(int i)
/// A macro for _ilg.Emit(OpCodes.Ldind_U2).
private void LdindU2() => _ilg!.Emit(OpCodes.Ldind_U2);
- /// A macro for _ilg.Emit(OpCodes.Stloc).
+ /// A macro for _ilg.Emit(OpCodes.Stloc_S).
private void Stloc(LocalBuilder lt) => _ilg!.Emit(OpCodes.Stloc_S, lt);
/// A macro for _ilg.Emit(OpCodes.Ldarg_0).
@@ -317,7 +328,7 @@ private void Ldc(int i)
private void Ldthisfld(FieldInfo ft)
{
Ldthis();
- _ilg!.Emit(OpCodes.Ldfld, ft);
+ Ldfld(ft);
}
/// A macro for Ldthis(); Ldfld(); Stloc();
@@ -335,6 +346,9 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft)
Stfld(ft);
}
+ /// <summary>A macro for _ilg.Emit(OpCodes.Ldfld).</summary>
+ private void Ldfld(FieldInfo ft) => _ilg!.Emit(OpCodes.Ldfld, ft);
+
/// A macro for _ilg.Emit(OpCodes.Stfld).
private void Stfld(FieldInfo ft) => _ilg!.Emit(OpCodes.Stfld, ft);
@@ -359,9 +373,15 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft)
/// A macro for _ilg.Emit(OpCodes.Blt) (long form).
private void BltFar(Label l) => _ilg!.Emit(OpCodes.Blt, l);
+ /// <summary>A macro for _ilg.Emit(OpCodes.Blt_Un) (long form).</summary>
+ private void BltUnFar(Label l) => _ilg!.Emit(OpCodes.Blt_Un, l);
+
/// A macro for _ilg.Emit(OpCodes.Bge) (long form).
private void BgeFar(Label l) => _ilg!.Emit(OpCodes.Bge, l);
+ /// <summary>A macro for _ilg.Emit(OpCodes.Bge_Un) (long form).</summary>
+ private void BgeUnFar(Label l) => _ilg!.Emit(OpCodes.Bge_Un, l);
+
/// A macro for _ilg.Emit(OpCodes.Bgt) (long form).
private void BgtFar(Label l) => _ilg!.Emit(OpCodes.Bgt, l);
@@ -401,9 +421,31 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft)
/// A macro for _ilg.Emit(OpCodes.Beq_S) (short jump).
private void Beq(Label l) => _ilg!.Emit(OpCodes.Beq_S, l);
- /// A macro for the Ldlen instruction).
+ /// A macro for the Ldlen instruction.
private void Ldlen() => _ilg!.Emit(OpCodes.Ldlen);
+ /// <summary>A macro for the Ldelem_I4 instruction.</summary>
+ private void LdelemI4() => _ilg!.Emit(OpCodes.Ldelem_I4);
+
+ /// <summary>A macro for the Stelem_I4 instruction.</summary>
+ private void StelemI4() => _ilg!.Emit(OpCodes.Stelem_I4);
+
+ /// <summary>A macro for _ilg.Emit(OpCodes.Switch, table).</summary>
+ private void Switch(Label[] table) => _ilg!.Emit(OpCodes.Switch, table);
+
+ /// <summary>Declares a local int.</summary>
+ private LocalBuilder DeclareInt32() => _ilg!.DeclareLocal(typeof(int));
+
+ /// <summary>Declares a local CultureInfo.</summary>
+ private LocalBuilder DeclareCultureInfo() => _ilg!.DeclareLocal(typeof(CultureInfo)); // cache local variable to avoid unnecessary TLS
+
+ /// <summary>Declares a local int[].</summary>
+ private LocalBuilder DeclareInt32Array() => _ilg!.DeclareLocal(typeof(int[]));
+
+ /// <summary>Declares a local string.</summary>
+ private LocalBuilder DeclareString() => _ilg!.DeclareLocal(typeof(string));
+
+ /// <summary>Declares a local ReadOnlySpan&lt;char&gt;.</summary>
+ private LocalBuilder DeclareReadOnlySpanChar() => _ilg!.DeclareLocal(typeof(ReadOnlySpan<char>));
+
/// Loads the char to the right of the current position.
private void Rightchar()
{
@@ -417,11 +459,11 @@ private void Rightcharnext()
{
Ldloc(_runtextLocal!);
Ldloc(_runtextposLocal!);
- Dup();
+ Callvirt(s_stringGetCharsMethod);
+ Ldloc(_runtextposLocal!);
Ldc(1);
Add();
Stloc(_runtextposLocal!);
- Callvirt(s_stringGetCharsMethod);
}
/// Loads the char to the left of the current position.
@@ -499,83 +541,83 @@ private void TrackUnique2(int i)
/// <summary>Prologue to code that will push an element on the tracking stack: loads the track array and the decremented track position (also stored back to the position local).</summary>
private void ReadyPushTrack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runtrackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runtrackposLocal!);
- _ilg.Emit(OpCodes.Ldc_I4_1);
- _ilg.Emit(OpCodes.Sub);
- _ilg.Emit(OpCodes.Dup);
- _ilg.Emit(OpCodes.Stloc_S, _runtrackposLocal!);
+ Ldloc(_runtrackLocal!);
+ Ldloc(_runtrackposLocal!);
+ Ldc(1);
+ Sub();
+ Dup();
+ Stloc(_runtrackposLocal!);
}
/// <summary>Pops an element off the tracking stack (leave it on the operand stack): the element at the current track position is loaded first, then the position local is incremented.</summary>
private void PopTrack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runtrackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runtrackposLocal!);
- _ilg.Emit(OpCodes.Dup);
- _ilg.Emit(OpCodes.Ldc_I4_1);
- _ilg.Emit(OpCodes.Add);
- _ilg.Emit(OpCodes.Stloc_S, _runtrackposLocal!);
- _ilg.Emit(OpCodes.Ldelem_I4);
+ Ldloc(_runtrackLocal!);
+ Ldloc(_runtrackposLocal!);
+ LdelemI4();
+ Ldloc(_runtrackposLocal!);
+ Ldc(1);
+ Add();
+ Stloc(_runtrackposLocal!);
}
/// <summary>Retrieves the top entry on the tracking stack without popping.</summary>
private void TopTrack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runtrackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runtrackposLocal!);
- _ilg.Emit(OpCodes.Ldelem_I4);
+ Ldloc(_runtrackLocal!);
+ Ldloc(_runtrackposLocal!);
+ LdelemI4();
}
/// <summary>Saves the value of a local variable on the grouping stack.</summary>
private void PushStack(LocalBuilder lt)
{
ReadyPushStack();
- _ilg!.Emit(OpCodes.Ldloc_S, lt);
+ Ldloc(lt);
DoPush();
}
/// <summary>Prologue to code that will replace the ith element on the grouping stack: loads the stack array and the stack position plus i (the add is skipped when i is 0).</summary>
internal void ReadyReplaceStack(int i)
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runstackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runstackposLocal!);
+ Ldloc(_runstackLocal!);
+ Ldloc(_runstackposLocal!);
if (i != 0)
{
Ldc(i);
- _ilg.Emit(OpCodes.Add);
+ Add();
}
}
/// <summary>Prologue to code that will push an element on the grouping stack: loads the stack array and the decremented stack position (also stored back to the position local).</summary>
private void ReadyPushStack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runstackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runstackposLocal!);
- _ilg.Emit(OpCodes.Ldc_I4_1);
- _ilg.Emit(OpCodes.Sub);
- _ilg.Emit(OpCodes.Dup);
- _ilg.Emit(OpCodes.Stloc_S, _runstackposLocal!);
+ Ldloc(_runstackLocal!);
+ Ldloc(_runstackposLocal!);
+ Ldc(1);
+ Sub();
+ Dup();
+ Stloc(_runstackposLocal!);
}
/// <summary>Retrieves the top entry on the stack without popping.</summary>
private void TopStack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runstackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runstackposLocal!);
- _ilg.Emit(OpCodes.Ldelem_I4);
+ Ldloc(_runstackLocal!);
+ Ldloc(_runstackposLocal!);
+ LdelemI4();
}
/// <summary>Pops an element off the grouping stack (leave it on the operand stack): the element at the current stack position is loaded first, then the position local is incremented.</summary>
private void PopStack()
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runstackLocal!);
- _ilg.Emit(OpCodes.Ldloc_S, _runstackposLocal!);
- _ilg.Emit(OpCodes.Dup);
- _ilg.Emit(OpCodes.Ldc_I4_1);
- _ilg.Emit(OpCodes.Add);
- _ilg.Emit(OpCodes.Stloc_S, _runstackposLocal!);
- _ilg.Emit(OpCodes.Ldelem_I4);
+ Ldloc(_runstackLocal!);
+ Ldloc(_runstackposLocal!);
+ LdelemI4();
+ Ldloc(_runstackposLocal!);
+ Ldc(1);
+ Add();
+ Stloc(_runstackposLocal!);
}
/// Pops 1 element off the grouping stack and discards it.
@@ -584,20 +626,20 @@ private void PopStack()
/// <summary>Pops i elements off the grouping stack and discards them (adds i to the stack position local).</summary>
private void PopDiscardStack(int i)
{
- _ilg!.Emit(OpCodes.Ldloc_S, _runstackposLocal!);
+ Ldloc(_runstackposLocal!);
Ldc(i);
- _ilg.Emit(OpCodes.Add);
- _ilg.Emit(OpCodes.Stloc_S, _runstackposLocal!);
+ Add();
+ Stloc(_runstackposLocal!);
}
/// Epilogue to code that will replace an element on a stack (use Ld* in between).
- private void DoReplace() => _ilg!.Emit(OpCodes.Stelem_I4);
+ private void DoReplace() => StelemI4();
/// Epilogue to code that will push an element on a stack (use Ld* in between).
- private void DoPush() => _ilg!.Emit(OpCodes.Stelem_I4);
+ private void DoPush() => StelemI4();
/// Jump to the backtracking switch.
- private void Back() => _ilg!.Emit(OpCodes.Br, _backtrack);
+ private void Back() => BrFar(_backtrack);
///
/// Branch to the MSIL corresponding to the regex code at i
@@ -650,7 +692,7 @@ private void Goto(int i)
private Label AdvanceLabel() => _labels![NextCodepos()];
/// Goto the next (forward) operation.
- private void Advance() => _ilg!.Emit(OpCodes.Br, AdvanceLabel());
+ private void Advance() => BrFar(AdvanceLabel());
/// Sets the culture local to CultureInfo.CurrentCulture.
private void InitLocalCultureInfo()
@@ -684,25 +726,22 @@ private void CallToLower()
///
private void GenerateForwardSection()
{
+ _uniquenote = new int[Uniquecount];
_labels = new Label[_codes!.Length];
_goto = new int[_codes.Length];
// initialize
- int codepos;
- for (codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos]))
+ Array.Fill(_uniquenote, -1);
+ for (int codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos]))
{
_goto[codepos] = -1;
- _labels[codepos] = _ilg!.DefineLabel();
+ _labels[codepos] = DefineLabel();
}
- _uniquenote = new int[Uniquecount];
- Array.Fill(_uniquenote, -1);
-
// emit variable initializers
Mvfldloc(s_runtextField, _runtextLocal!);
- Mvfldloc(s_runtextstartField, _runtextstartLocal!);
Mvfldloc(s_runtextbegField, _runtextbegLocal!);
Mvfldloc(s_runtextendField, _runtextendLocal!);
Mvfldloc(s_runtextposField, _runtextposLocal!);
@@ -713,7 +752,7 @@ private void GenerateForwardSection()
_backpos = -1;
- for (codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos]))
+ for (int codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos]))
{
MarkLabel(_labels[codepos]);
_codepos = codepos;
@@ -730,28 +769,65 @@ private void GenerateForwardSection()
///
private void GenerateMiddleSection()
{
- // Backtrack switch
+ LocalBuilder limitLocal = _temp1Local!;
+ Label afterDoubleStack = DefineLabel();
+ Label afterDoubleTrack = DefineLabel();
+
+ // Backtrack:
MarkLabel(_backtrack);
- // first call EnsureStorage
- Mvlocfld(_runtrackposLocal!, s_runtrackposField);
+ // (Equivalent of EnsureStorage, but written to avoid unnecessary local spilling.)
+
+ // int limit = this.runtrackcount * 4;
+ Ldthisfld(s_runtrackcountField);
+ Ldc(4);
+ Mul();
+ Stloc(limitLocal);
+
+ // if (runstackpos < limit)
+ // {
+ // this.runstackpos = runstackpos;
+ // DoubleStack(); // might change runstackpos and runstack
+ // runstackpos = this.runstackpos;
+ // runstack = this.runstack;
+ // }
+ Ldloc(_runstackposLocal!);
+ Ldloc(limitLocal);
+ Bge(afterDoubleStack);
Mvlocfld(_runstackposLocal!, s_runstackposField);
Ldthis();
- Callvirt(s_ensureStorageMethod);
- Mvfldloc(s_runtrackposField, _runtrackposLocal!);
+ Callvirt(s_doubleStackMethod);
Mvfldloc(s_runstackposField, _runstackposLocal!);
- Mvfldloc(s_runtrackField, _runtrackLocal!);
Mvfldloc(s_runstackField, _runstackLocal!);
+ MarkLabel(afterDoubleStack);
+
+ // if (runtrackpos < limit)
+ // {
+ // this.runtrackpos = runtrackpos;
+ // DoubleTrack(); // might change runtrackpos and runtrack
+ // runtrackpos = this.runtrackpos;
+ // runtrack = this.runtrack;
+ // }
+ Ldloc(_runtrackposLocal!);
+ Ldloc(limitLocal);
+ Bge(afterDoubleTrack);
+ Mvlocfld(_runtrackposLocal!, s_runtrackposField);
+ Ldthis();
+ Callvirt(s_doubleTrackMethod);
+ Mvfldloc(s_runtrackposField, _runtrackposLocal!);
+ Mvfldloc(s_runtrackField, _runtrackLocal!);
+ MarkLabel(afterDoubleTrack);
+ // runtrack[runtrackpos++]
PopTrack();
+ // Backtracking jump table
var table = new Label[_notecount];
for (int i = 0; i < _notecount; i++)
{
table[i] = _notes![i]._label;
}
-
- _ilg!.Emit(OpCodes.Switch, table);
+ Switch(table);
}
///
@@ -765,7 +841,7 @@ private void GenerateBacktrackSection()
BacktrackNote n = _notes![i];
if (n._flags != 0)
{
- _ilg!.MarkLabel(n._label);
+ MarkLabel(n._label);
_codepos = n._codepos;
_backpos = i;
_regexopcode = _codes![n._codepos] | n._flags;
@@ -783,10 +859,11 @@ private void GenerateBacktrackSection()
///
protected void GenerateFindFirstChar()
{
- _runtextposLocal = DeclareInt();
+ _runtextposLocal = DeclareInt32();
+ _runtextendLocal = DeclareInt32();
_runtextLocal = DeclareString();
- _temp1Local = DeclareInt();
- _temp2Local = DeclareInt();
+ _temp1Local = DeclareInt32();
+ _temp2Local = DeclareInt32();
_cultureLocal = null;
if (!_options.HasFlag(RegexOptions.CultureInvariant))
{
@@ -1029,7 +1106,7 @@ protected void GenerateFindFirstChar()
}
Ldloc(chLocal);
- _ilg!.Emit(OpCodes.Switch, table);
+ Switch(table);
for (int i = _bmPrefix.LowASCII; i <= _bmPrefix.HighASCII; i++)
{
@@ -1123,190 +1200,1398 @@ protected void GenerateFindFirstChar()
Ldc(0);
BleFar(l4);
- MarkLabel(l1);
+ MarkLabel(l1);
+
+ Ldloc(cLocal);
+ Ldc(1);
+ Sub();
+ Stloc(cLocal);
+
+ Leftcharnext();
+
+ if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
+ {
+ EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
+ BrtrueFar(l2);
+ }
+ else
+ {
+ Ldc(RegexCharClass.SingletonChar(_fcPrefix.GetValueOrDefault().Prefix));
+ Beq(l2);
+ }
+
+ MarkLabel(l5);
+
+ Ldloc(cLocal);
+ Ldc(0);
+ if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
+ {
+ BgtFar(l1);
+ }
+ else
+ {
+ Bgt(l1);
+ }
+
+ Ldc(0);
+ BrFar(l3);
+
+ MarkLabel(l2);
+
+ Ldloc(_runtextposLocal);
+ Ldc(1);
+ Sub(_code.RightToLeft);
+ Stloc(_runtextposLocal);
+ Ldc(1);
+
+ MarkLabel(l3);
+
+ Mvlocfld(_runtextposLocal, s_runtextposField);
+ Ret();
+
+ MarkLabel(l4);
+ Ldc(0);
+ Ret();
+ }
+ else // for left-to-right, we can take advantage of vectorization and JIT optimizations
+ {
+ LocalBuilder iLocal = _temp2Local;
+ Label returnFalseLabel = DefineLabel();
+ Label updatePosAndReturnFalse = DefineLabel();
+
+ Mvfldloc(s_runtextposField, _runtextposLocal);
+ Mvfldloc(s_runtextendField, _runtextendLocal);
+
+ // if (runtextend > runtextpos)
+ Ldloc(_runtextendLocal);
+ Ldloc(_runtextposLocal);
+ BleFar(returnFalseLabel);
+
+ Span setChars = stackalloc char[3];
+ int setCharsCount;
+ if (!_fcPrefix.GetValueOrDefault().CaseInsensitive &&
+ (setCharsCount = RegexCharClass.GetSetChars(_fcPrefix.GetValueOrDefault().Prefix, setChars)) > 0)
+ {
+ // This is a case-sensitive class with a small number of characters in the class, small enough
+ // that we can generate an IndexOf{Any} call. That takes advantage of optimizations in
+ // IndexOf{Any}, such as vectorization, which our open-coded loop through the span doesn't have.
+ switch (setCharsCount)
+ {
+ case 1:
+ // int i = this.runtext.IndexOf(setChars[0], runtextpos, runtextend - runtextpos);
+ Ldthisfld(s_runtextField);
+ Ldc(setChars[0]);
+ Ldloc(_runtextposLocal);
+ Ldloc(_runtextendLocal);
+ Ldloc(_runtextposLocal);
+ Sub();
+ Call(s_stringIndexOf);
+ Stloc(iLocal);
+
+ // if (i >= 0)
+ Ldloc(iLocal);
+ Ldc(0);
+ BltFar(updatePosAndReturnFalse);
+
+ // this.runtextpos = i; return true;
+ Mvlocfld(iLocal, s_runtextposField);
+ Ldc(1);
+ Ret();
+ break;
+
+ case 2:
+ case 3:
+ // int i = this.runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOfAny(setChars[0], setChars[1]{, setChars[2]});
+ Ldthisfld(s_runtextField);
+ Ldloc(_runtextposLocal);
+ Ldloc(_runtextendLocal);
+ Ldloc(_runtextposLocal);
+ Sub();
+ Call(s_stringAsSpanMethod);
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ if (setCharsCount == 3)
+ {
+ Ldc(setChars[2]);
+ Call(s_spanIndexOfAnyCharCharChar);
+ }
+ else
+ {
+ Call(s_spanIndexOfAnyCharChar);
+ }
+ Stloc(iLocal);
+
+ // if (i >= 0)
+ Ldloc(iLocal);
+ Ldc(0);
+ BltFar(updatePosAndReturnFalse);
+
+ // this.runtextpos = runtextpos + i; return true;
+ Ldthis();
+ Ldloc(_runtextposLocal);
+ Ldloc(iLocal);
+ Add();
+ Stfld(s_runtextposField);
+ Ldc(1);
+ Ret();
+ break;
+
+ default:
+ Debug.Fail("Unexpected setCharsCount: " + setCharsCount);
+ break;
+ }
+ }
+ else
+ {
+ // Either this isn't a class with just a few characters in it, or this is case insensitive.
+ // Either way, create a span and iterate through it rather than the original string in order
+ // to avoid bounds checks on each access.
+
+ LocalBuilder charInClassLocal = _temp1Local;
+ _temp3Local = DeclareReadOnlySpanChar();
+ LocalBuilder textSpanLocal = _temp3Local;
+
+ Label checkSpanLengthLabel = DefineLabel();
+ Label charNotInClassLabel = DefineLabel();
+ Label loopBody = DefineLabel();
+
+ // ReadOnlySpan span = this.runtext.AsSpan(runtextpos, runtextend - runtextpos);
+ Ldthisfld(s_runtextField);
+ Ldloc(_runtextposLocal);
+ Ldloc(_runtextendLocal);
+ Ldloc(_runtextposLocal);
+ Sub();
+ Call(s_stringAsSpanMethod);
+ Stloc(textSpanLocal);
+
+ // for (int i = 0;
+ Ldc(0);
+ Stloc(iLocal);
+ BrFar(checkSpanLengthLabel);
+
+ // if (CharInClass(span[i], "..."))
+ MarkLabel(loopBody);
+ Ldloca(textSpanLocal);
+ Ldloc(iLocal);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
+ BrfalseFar(charNotInClassLabel);
+
+ // this.runtextpos = runtextpos + i; return true;
+ Ldthis();
+ Ldloc(_runtextposLocal);
+ Ldloc(iLocal);
+ Add();
+ Stfld(s_runtextposField);
+ Ldc(1);
+ Ret();
+
+ // for (...; ...; i++)
+ MarkLabel(charNotInClassLabel);
+ Ldloc(iLocal);
+ Ldc(1);
+ Add();
+ Stloc(iLocal);
+
+ // for (...; i < span.Length; ...);
+ MarkLabel(checkSpanLengthLabel);
+ Ldloc(iLocal);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ BltFar(loopBody);
+ }
+
+ // this.runtextpos = runtextend;
+ MarkLabel(updatePosAndReturnFalse);
+ Ldthis();
+ Ldloc(_runtextendLocal);
+ Stfld(s_runtextposField);
+
+ // return false;
+ MarkLabel(returnFalseLabel);
+ Ldc(0);
+ Ret();
+ }
+ }
+
+ /// Generates a very simple method that stores the computed _trackcount value into the runner's runtrackcount field.
+ protected void GenerateInitTrackCount()
+ {
+ Ldthis();
+ Ldc(_trackcount);
+ Stfld(s_runtrackcountField);
+ Ret();
+ }
+
+ private bool TryGenerateNonBacktrackingGo(RegexNode node)
+ {
+ Debug.Assert(node.Type == RegexNode.Capture && node.ChildCount() == 1,
+ "Every generated tree should begin with a capture node that has a single child.");
+
+ // RightToLeft is rare and not worth adding a lot of custom code to handle in this path.
+ if ((node.Options & RegexOptions.RightToLeft) != 0)
+ {
+ return false;
+ }
+
+ // Skip the Capture node. This path only supports the implicit capture of the whole match,
+ // which we handle implicitly at the end of the generated code in one location.
+ node = node.Child(0);
+ if (!NodeSupportsNonBacktrackingImplementation(node, level: 0))
+ {
+ return false;
+ }
+
+ // We've determined that the RegexNode can be handled with this optimized path. Generate the code.
+#if DEBUG
+ if ((_options & RegexOptions.Debug) != 0)
+ {
+ Debug.WriteLine("Using optimized non-backtracking code gen.");
+ }
+#endif
+
+ // Declare some locals.
+ LocalBuilder runtextLocal = DeclareString();
+ LocalBuilder originalruntextposLocal = DeclareInt32();
+ LocalBuilder runtextposLocal = DeclareInt32();
+ LocalBuilder textSpanLocal = DeclareReadOnlySpanChar();
+ Stack? iterationLocals = null;
+ Stack? spanLocals = null;
+ Label stopSuccessLabel = DefineLabel();
+ Label doneLabel = DefineLabel();
+ if (_hasTimeout)
+ {
+ _loopTimeoutCounterLocal = DeclareInt32();
+ }
+
+ // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
+ InitializeCultureForGoIfNecessary();
+
+ // string runtext = this.runtext;
+ Ldthisfld(s_runtextField);
+ Stloc(runtextLocal);
+
+ // int originalruntextpos, runtextpos;
+ // runtextpos = originalruntextpos = this.runtextpos;
+ Ldthisfld(s_runtextposField);
+ Dup();
+ Stloc(originalruntextposLocal);
+ Stloc(runtextposLocal);
+
+ // The implementation tries to use const indexes into the span wherever possible, which we can do
+ // in all places except for variable-length loops. For everything else, we know at any point in
+ // the regex exactly how far into it we are, and we can use that to index into the span created
+ // at the beginning of the routine to begin at exactly where we're starting in the input. For
+ // variable-length loops, we index at this textSpanPos + i, and then after the loop we slice the input
+ // by i so that this position is still accurate for everything after it.
+ int textSpanPos = 0;
+ LoadTextSpanLocal();
+
+ // Emit the code for all nodes in the tree.
+ EmitNode(node);
+
+ // Success:
+ // this.runtextpos = runtextpos + textSpanPos;
+ MarkLabel(stopSuccessLabel);
+ Ldthis();
+ Ldloc(runtextposLocal);
+ if (textSpanPos > 0)
+ {
+ Ldc(textSpanPos);
+ Add();
+ }
+ Stfld(s_runtextposField);
+
+ // Capture(0, originalruntextposLocal, this.runtextpos);
+ Ldthis();
+ Ldc(0);
+ Ldloc(originalruntextposLocal);
+ Ldthisfld(s_runtextposField);
+ Callvirt(s_captureMethod);
+
+ // Done:
+ // return;
+ MarkLabel(doneLabel);
+ Ret();
+
+ // Generated code successfully with non-backtracking implementation.
+ return true;
+
+ // Determines whether the node supports an optimized implementation that doesn't allow for backtracking.
+ static bool NodeSupportsNonBacktrackingImplementation(RegexNode node, int level)
+ {
+ bool supported = false;
+
+ // We only support the default left-to-right, not right-to-left, which requires more complication in the generated code.
+ // (Right-to-left is only employed when explicitly asked for by the developer or by lookbehind assertions.)
+ // We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching
+ // away from a recursive implementation (done for convenience) to an iterative one that's more complicated
+ // but without the same problems.
+ if ((node.Options & RegexOptions.RightToLeft) == 0 &&
+ level < 20) // arbitrary cut-off to limit stack dives
+ {
+ int childCount = node.ChildCount();
+
+ switch (node.Type)
+ {
+ // One/Notone/Set/Multi don't involve any repetition and are easily supported.
+ case RegexNode.One:
+ case RegexNode.Notone:
+ case RegexNode.Set:
+ case RegexNode.Multi:
+ // Boundaries are like set checks and don't involve repetition, either.
+ case RegexNode.Boundary:
+ case RegexNode.Nonboundary:
+ case RegexNode.ECMABoundary:
+ case RegexNode.NonECMABoundary:
+ // Anchors are also trivial.
+ case RegexNode.Beginning:
+ case RegexNode.Start:
+ case RegexNode.Bol:
+ case RegexNode.Eol:
+ case RegexNode.End:
+ case RegexNode.EndZ:
+ // {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops.
+ // These consume their {Set/One} inputs as long as they match, and don't give up anything they
+ // matched, which means we can support them without backtracking.
+ case RegexNode.Oneloopatomic:
+ case RegexNode.Notoneloopatomic:
+ case RegexNode.Setloopatomic:
+ // "Empty" is easy: nothing is emitted for it.
+ // "Nothing" is also easy: it doesn't match anything.
+ case RegexNode.Empty:
+ case RegexNode.Nothing:
+ supported = true;
+ break;
+
+ // Repeaters don't require backtracking as long as their min and max are equal.
+ // At that point they're just a shorthand for writing out the One/Notone/Set
+ // that number of times.
+ case RegexNode.Oneloop:
+ case RegexNode.Notoneloop:
+ case RegexNode.Setloop:
+ Debug.Assert(node.Next == null || node.Next.Type != RegexNode.Atomic, "Loop should have been transformed into an atomic type.");
+ goto case RegexNode.Onelazy;
+ case RegexNode.Onelazy:
+ case RegexNode.Notonelazy:
+ case RegexNode.Setlazy:
+ supported = node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic);
+ break;
+
+ // {Lazy}Loop repeaters are the same, except their child also needs to be supported.
+ // We also support such loops being atomic.
+ case RegexNode.Loop:
+ case RegexNode.Lazyloop:
+ supported =
+ (node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)) &&
+ NodeSupportsNonBacktrackingImplementation(node.Child(0), level + 1);
+ break;
+
+ // We can handle atomic as long as we can handle making its child atomic, or
+ // its child doesn't have that concept.
+ case RegexNode.Atomic:
+ // Lookahead assertions also only require that the child node be supported.
+ // The RightToLeft check earlier is important to differentiate lookbehind,
+ // which is not supported.
+ case RegexNode.Require:
+ case RegexNode.Prevent:
+ supported = NodeSupportsNonBacktrackingImplementation(node.Child(0), level + 1);
+ break;
+
+ // We can handle alternates as long as they're atomic (a root / global alternate is
+ // effectively atomic, as nothing will try to backtrack into it as it's the last thing).
+ // Its children must all also be supported.
+ case RegexNode.Alternate:
+ if (node.Next != null &&
+ (node.Next.Type == RegexNode.Atomic || // atomic alternate
+ (node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate
+ {
+ goto case RegexNode.Concatenate;
+ }
+ break;
+
+ // Concatenation doesn't require backtracking as long as its children don't.
+ case RegexNode.Concatenate:
+ supported = true;
+ for (int i = 0; i < childCount; i++)
+ {
+ if (supported && !NodeSupportsNonBacktrackingImplementation(node.Child(i), level + 1))
+ {
+ supported = false;
+ break;
+ }
+ }
+ break;
+ }
+ }
+#if DEBUG
+ if (!supported && (node.Options & RegexOptions.Debug) != 0)
+ {
+ Debug.WriteLine($"Unable to use non-backtracking code gen: node {node.Description()} isn't supported.");
+ }
+#endif
+ return supported;
+ }
+
+ static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
+
+ // Creates a span for runtext starting at runtextpos until this.runtextend.
+ void LoadTextSpanLocal()
+ {
+ // textSpan = runtext.AsSpan(runtextpos, this.runtextend - runtextpos);
+ Ldloc(runtextLocal);
+ Ldloc(runtextposLocal);
+ Ldthisfld(s_runtextendField);
+ Ldloc(runtextposLocal);
+ Sub();
+ Call(s_stringAsSpanMethod);
+ Stloc(textSpanLocal);
+ }
+
+ // Rents an Int32 local. We want to minimize the number of locals we create, so we maintain
+ // a pool of them, only adding when needing, and nested constructs that each need their own
+ // independent local can use this to get one.
+ LocalBuilder RentInt32Local()
+ {
+ iterationLocals ??= new Stack(1);
+ return iterationLocals.TryPop(out LocalBuilder? iterationLocal) ? iterationLocal : DeclareInt32();
+ }
+
+ // Returns a rented Int32 local.
+ void ReturnInt32Local(LocalBuilder int32Local)
+ {
+ Debug.Assert(iterationLocals != null);
+ Debug.Assert(int32Local.LocalType == typeof(int));
+ iterationLocals.Push(int32Local);
+ }
+
+ LocalBuilder RentReadOnlySpanCharLocal()
+ {
+ spanLocals ??= new Stack(1);
+ return spanLocals.TryPop(out LocalBuilder? iterationLocal) ? iterationLocal : DeclareReadOnlySpanChar();
+ }
+
+ void ReturnReadOnlySpanCharLocal(LocalBuilder spanLocal)
+ {
+ Debug.Assert(spanLocals != null);
+ Debug.Assert(spanLocal.LocalType == typeof(ReadOnlySpan));
+ spanLocals.Push(spanLocal);
+ }
+
+ void EmitSum(int constant, LocalBuilder? local)
+ {
+ if (local == null)
+ {
+ Ldc(constant);
+ }
+ else if (constant == 0)
+ {
+ Ldloc(local);
+ }
+ else
+ {
+ Ldloc(local);
+ Ldc(constant);
+ Add();
+ }
+ }
+
+ // Emits a check that the span is large enough at the currently known static position to handle the required additional length.
+ void EmitSpanLengthCheck(int requiredLength, LocalBuilder? dynamicRequiredLength = null)
+ {
+ // if ((uint)(textSpanPos + requiredLength + dynamicRequiredLength - 1) >= (uint)textSpan.Length) goto Done;
+ Debug.Assert(requiredLength > 0);
+ EmitSum(textSpanPos + requiredLength - 1, dynamicRequiredLength);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ BgeUnFar(doneLabel);
+ }
+
+ void TransferTextSpanPosToRunTextPos()
+ {
+ if (textSpanPos > 0)
+ {
+ // runtextpos += textSpanPos;
+ Ldloc(runtextposLocal);
+ Ldc(textSpanPos);
+ Add();
+ Stloc(runtextposLocal);
+
+ // textSpan = textSpan.Slice(textSpanPos);
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos);
+ Call(s_spanSliceIntMethod);
+ Stloc(textSpanLocal);
+
+ // textSpanPos = 0;
+ textSpanPos = 0;
+ }
+ }
+
+ // Emits the code for an atomic alternate, one that once a branch successfully matches is non-backtracking into it.
+ // This amounts to generating the code for each branch, with failures in a branch resetting state to what it was initially
+ // and then jumping to the next branch. We don't need to worry about uncapturing, because capturing is only allowed for the
+ // implicit capture that happens for the whole match at the end.
+ void EmitAtomicAlternate(RegexNode node)
+ {
+ // int startingTextSpanPos = textSpanPos;
+ // int startingRunTextPos = runtextpos;
+ //
+ // Branch0(); // jumps to NextBranch1 on failure
+ // goto Success;
+ //
+ // NextBranch1:
+ // runtextpos = startingRunTextPos;
+ // textSpan = this.runtext.AsSpan(runtextpos, this.runtextend - runtextpos);
+ // Branch1(); // jumps to NextBranch2 on failure
+ // goto Success;
+ //
+ // ...
+ //
+ // NextBranchN:
+ // runtextpos = startingRunTextPos;
+ // textSpan = this.runtext.AsSpan(runtextpos, this.runtextend - runtextpos);
+ // textSpanPos = startingTextSpanPos;
+ // BranchN(); // jumps to Done on failure
+
+ // Save off runtextpos. We'll need to reset this each time a branch fails.
+ LocalBuilder startingRunTextPos = RentInt32Local();
+ Ldloc(runtextposLocal);
+ Stloc(startingRunTextPos);
+ int startingTextSpanPos = textSpanPos;
+
+ // Label to jump to when any branch completes successfully.
+ Label doneAlternate = DefineLabel();
+
+ // A failure in a branch other than the last should jump to the next
+ // branch, not to the final done.
+ Label postAlternateDone = doneLabel;
+
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount - 1; i++)
+ {
+ Label nextBranch = DefineLabel();
+ doneLabel = nextBranch;
+
+ // Emit the code for each branch.
+ EmitNode(node.Child(i));
+
+ // If we get here in the generated code, the branch completed successfully.
+ // Before jumping to the end, we need to zero out textSpanPos, so that no
+ // matter what the value is after the branch, whatever follows the alternate
+ // will see the same textSpanPos.
+ TransferTextSpanPosToRunTextPos();
+ BrFar(doneAlternate);
+
+ // Reset state for next branch and loop around to generate it.
+ MarkLabel(nextBranch);
+ Ldloc(startingRunTextPos);
+ Stloc(runtextposLocal);
+ LoadTextSpanLocal();
+ textSpanPos = startingTextSpanPos;
+ }
+
+ // If the final branch fails, that's like any other failure, and we jump to done.
+ doneLabel = postAlternateDone;
+ EmitNode(node.Child(childCount - 1));
+ TransferTextSpanPosToRunTextPos();
+
+ // Successfully completed the alternate.
+ MarkLabel(doneAlternate);
+ ReturnInt32Local(startingRunTextPos);
+
+ Debug.Assert(textSpanPos == 0);
+ }
+
+ void EmitPositiveLookaheadAssertion(RegexNode node)
+ {
+ // Save off runtextpos. We'll need to reset this upon successful completion of the lookahead.
+ LocalBuilder startingRunTextPos = RentInt32Local();
+ Ldloc(runtextposLocal);
+ Stloc(startingRunTextPos);
+ int startingTextSpanPos = textSpanPos;
+
+ // Emit the child.
+ EmitNode(node.Child(0));
+
+ // After the child completes successfully, reset the text positions.
+ Ldloc(startingRunTextPos);
+ Stloc(runtextposLocal);
+ LoadTextSpanLocal();
+ textSpanPos = startingTextSpanPos;
+
+ ReturnInt32Local(startingRunTextPos);
+ }
+
+ void EmitNegativeLookaheadAssertion(RegexNode node)
+ {
+ // Save off runtextpos. We'll need to reset this once the child fails to match (which is success for a negative lookahead).
+ LocalBuilder startingRunTextPos = RentInt32Local();
+ Ldloc(runtextposLocal);
+ Stloc(startingRunTextPos);
+ int startingTextSpanPos = textSpanPos;
+
+ Label originalDoneLabel = doneLabel;
+ doneLabel = DefineLabel();
+
+ // Emit the child.
+ EmitNode(node.Child(0));
+
+ // If the generated code ends up here, it matched the lookahead, which actually
+ // means failure for a _negative_ lookahead, so we need to jump to the original done.
+ BrFar(originalDoneLabel);
+
+ // Failures (success for a negative lookahead) jump here.
+ MarkLabel(doneLabel);
+ doneLabel = originalDoneLabel;
+
+ // After the child completes in failure (success for negative lookahead), reset the text positions.
+ Ldloc(startingRunTextPos);
+ Stloc(runtextposLocal);
+ LoadTextSpanLocal();
+ textSpanPos = startingTextSpanPos;
+
+ ReturnInt32Local(startingRunTextPos);
+ }
+
+ // Emits the code for the node.
+ void EmitNode(RegexNode node)
+ {
+ switch (node.Type)
+ {
+ case RegexNode.One:
+ case RegexNode.Notone:
+ case RegexNode.Set:
+ EmitSingleChar(node);
+ break;
+
+ case RegexNode.Boundary:
+ case RegexNode.Nonboundary:
+ case RegexNode.ECMABoundary:
+ case RegexNode.NonECMABoundary:
+ EmitBoundary(node);
+ break;
+
+ case RegexNode.Beginning:
+ case RegexNode.Start:
+ case RegexNode.Bol:
+ case RegexNode.Eol:
+ case RegexNode.End:
+ case RegexNode.EndZ:
+ EmitAnchors(node);
+ break;
+
+ case RegexNode.Multi:
+ EmitMultiChar(node);
+ break;
+
+ case RegexNode.Oneloopatomic:
+ case RegexNode.Notoneloopatomic:
+ case RegexNode.Setloopatomic:
+ EmitAtomicSingleCharLoop(node);
+ break;
+
+ case RegexNode.Loop:
+ EmitAtomicNodeLoop(node);
+ break;
+
+ case RegexNode.Lazyloop:
+ // An atomic lazy loop amounts to doing the minimum amount of work possible.
+ // That means iterating as little as is required, which means a repeater
+ // for the min, and if min is 0, doing nothing.
+ Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
+ if (node.M > 0)
+ {
+ EmitNodeRepeater(node);
+ }
+ break;
+
+ case RegexNode.Atomic:
+ EmitNode(node.Child(0));
+ break;
+
+ case RegexNode.Alternate:
+ EmitAtomicAlternate(node);
+ break;
+
+ case RegexNode.Oneloop:
+ case RegexNode.Onelazy:
+ case RegexNode.Notoneloop:
+ case RegexNode.Notonelazy:
+ case RegexNode.Setloop:
+ case RegexNode.Setlazy:
+ EmitSingleCharRepeater(node);
+ break;
+
+ case RegexNode.Concatenate:
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ EmitNode(node.Child(i));
+ }
+ break;
+
+ case RegexNode.Require:
+ EmitPositiveLookaheadAssertion(node);
+ break;
+
+ case RegexNode.Prevent:
+ EmitNegativeLookaheadAssertion(node);
+ break;
+
+ case RegexNode.Nothing:
+ BrFar(doneLabel);
+ break;
+
+ case RegexNode.Empty:
+ // Emit nothing.
+ break;
+
+ default:
+ Debug.Fail($"Unexpected node type: {node.Type}");
+ break;
+ }
+ }
+
+ // Emits the code to handle a single-character match.
+ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null)
+ {
+ // if ((uint)(textSpanPos + offset) >= textSpan.Length || textSpan[textSpanPos + offset] != ch) goto Done;
+ if (emitLengthCheck)
+ {
+ EmitSpanLengthCheck(1, offset);
+ }
+ Ldloca(textSpanLocal);
+ EmitSum(textSpanPos, offset);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ switch (node.Type)
+ {
+ // This only emits a single check, but it's called from the looping constructs in a loop
+ // to generate the code for a single check, so we map those looping constructs to the
+ // appropriate single check.
+
+ case RegexNode.Set:
+ case RegexNode.Setlazy:
+ case RegexNode.Setloop:
+ case RegexNode.Setloopatomic:
+ LocalBuilder setScratchLocal = RentInt32Local();
+ EmitCallCharInClass(node.Str!, IsCaseInsensitive(node), setScratchLocal);
+ ReturnInt32Local(setScratchLocal);
+ BrfalseFar(doneLabel);
+ break;
+
+ case RegexNode.One:
+ case RegexNode.Onelazy:
+ case RegexNode.Oneloop:
+ case RegexNode.Oneloopatomic:
+ if (IsCaseInsensitive(node)) CallToLower();
+ Ldc(node.Ch);
+ BneFar(doneLabel);
+ break;
+
+ default:
+ Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopatomic);
+ if (IsCaseInsensitive(node)) CallToLower();
+ Ldc(node.Ch);
+ BeqFar(doneLabel);
+ break;
+ }
+
+ textSpanPos++;
+ }
+
+ // Emits the code to handle a boundary check on a character.
+ void EmitBoundary(RegexNode node)
+ {
+ // if (!IsBoundary(runtextpos + textSpanPos, this.runtextbeg, this.runtextend)) goto doneLabel;
+ Ldthis();
+ Ldloc(runtextposLocal);
+ if (textSpanPos > 0)
+ {
+ Ldc(textSpanPos);
+ Add();
+ }
+ Ldthisfld(s_runtextbegField!);
+ Ldthisfld(s_runtextendField!);
+ switch (node.Type)
+ {
+ case RegexNode.Boundary:
+ Callvirt(s_isBoundaryMethod);
+ BrfalseFar(doneLabel);
+ break;
+
+ case RegexNode.Nonboundary:
+ Callvirt(s_isBoundaryMethod);
+ BrtrueFar(doneLabel);
+ break;
+
+ case RegexNode.ECMABoundary:
+ Callvirt(s_isECMABoundaryMethod);
+ BrfalseFar(doneLabel);
+ break;
+
+ default:
+ Debug.Assert(node.Type == RegexNode.NonECMABoundary);
+ Callvirt(s_isECMABoundaryMethod);
+ BrtrueFar(doneLabel);
+ break;
+ }
+ }
+
+ // Emits the code to handle various anchors.
+ void EmitAnchors(RegexNode node)
+ {
+ Debug.Assert(textSpanPos >= 0);
+ switch (node.Type)
+ {
+ case RegexNode.Beginning:
+ case RegexNode.Start:
+ if (textSpanPos > 0)
+ {
+ // If we statically know we've already matched part of the regex, there's no way we're at the
+ // beginning or start, as we've already progressed past it.
+ BrFar(doneLabel);
+ }
+ else
+ {
+ // if (runtextpos != this.runtextbeg/start) goto doneLabel;
+ Ldloc(runtextposLocal);
+ Ldthisfld(node.Type == RegexNode.Beginning ? s_runtextbegField : s_runtextstartField);
+ BneFar(doneLabel);
+ }
+ break;
+
+ case RegexNode.Bol:
+ if (textSpanPos > 0)
+ {
+ // if (textSpan[textSpanPos - 1] != '\n') goto doneLabel;
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos - 1);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ Ldc('\n');
+ BneFar(doneLabel);
+ }
+ else
+ {
+ // We can't use our textSpan in this case, because we'd need to access textSpan[-1], so we access the runtext field directly:
+ // if (runtextpos > this.runtextbeg && this.runtext[runtextpos - 1] != '\n') goto doneLabel;
+ Label success = DefineLabel();
+ Ldloc(runtextposLocal);
+ Ldthisfld(s_runtextbegField);
+ Ble(success);
+ Ldthisfld(s_runtextField);
+ Ldloc(runtextposLocal);
+ Ldc(1);
+ Sub();
+ Callvirt(s_stringGetCharsMethod);
+ Ldc('\n');
+ BneFar(doneLabel);
+ MarkLabel(success);
+ }
+ break;
+
+ case RegexNode.End:
+ // if (textSpanPos < textSpan.Length) goto doneLabel;
+ Ldc(textSpanPos);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ BltUnFar(doneLabel);
+ break;
+
+ case RegexNode.EndZ:
+ // if (textSpanPos < textSpan.Length - 1) goto doneLabel;
+ Ldc(textSpanPos);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ Ldc(1);
+ Sub();
+ BltFar(doneLabel);
+ goto case RegexNode.Eol;
+
+ case RegexNode.Eol:
+ // if (textSpanPos < textSpan.Length && textSpan[textSpanPos] != '\n') goto doneLabel;
+ {
+ Label success = DefineLabel();
+ Ldc(textSpanPos);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ BgeUnFar(success);
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ Ldc('\n');
+ BneFar(doneLabel);
+ MarkLabel(success);
+ }
+ break;
+ }
+ }
+
+ // Emits the code to handle a multiple-character match.
+ void EmitMultiChar(RegexNode node)
+ {
+ // if ((uint)(textSpanPos + node.Str.Length - 1) >= (uint)textSpan.Length) goto doneLabel;
+ // if (node.Str[0] != textSpan[textSpanPos]) goto doneLabel;
+ // if (node.Str[1] != textSpan[textSpanPos+1]) goto doneLabel;
+ // ...
+ EmitSpanLengthCheck(node.Str!.Length);
+ for (int i = 0; i < node.Str!.Length; i++)
+ {
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos + i);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ if (IsCaseInsensitive(node)) CallToLower();
+ Ldc(node.Str[i]);
+ BneFar(doneLabel);
+ }
+
+ textSpanPos += node.Str.Length;
+ }
+
+ // Emits the code to handle a single-character loop (repeater) with a fixed number of iterations.
+ // RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
+ void EmitSingleCharRepeater(RegexNode node)
+ {
+ int iterations = node.M;
+
+ if (iterations == 0)
+ {
+ // No iterations, nothing to do.
+ return;
+ }
+
+ // if ((uint)(textSpanPos + iterations - 1) >= (uint)textSpan.Length) goto doneLabel;
+ EmitSpanLengthCheck(iterations);
+
+ // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated
+ // code with other costs, like the (small) overhead of slicing to create the temp span to iterate.
+ const int MaxUnrollSize = 16;
+
+ if (iterations <= MaxUnrollSize)
+ {
+ // if (textSpan[textSpanPos] != ch ||
+ // textSpan[textSpanPos + 1] != ch ||
+ // ...)
+ // goto doneLabel;
+ for (int i = 0; i < iterations; i++)
+ {
+ EmitSingleChar(node, emitLengthCheck: false); // length was checked once above; presumably advances textSpanPos by one per call -- confirm
+ }
+ }
+ else
+ {
+ // ReadOnlySpan tmp = textSpan.Slice(textSpanPos, iterations);
+ // for (int i = 0; i < tmp.Length; i++)
+ // {
+ // TimeoutCheck();
+ // if (tmp[i] != ch) goto Done;
+ // }
+ // textSpanPos += iterations;
+
+ Label conditionLabel = DefineLabel();
+ Label bodyLabel = DefineLabel();
+ LocalBuilder iterationLocal = RentInt32Local();
+ LocalBuilder spanLocal = RentReadOnlySpanCharLocal();
+
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos);
+ Ldc(iterations);
+ Call(s_spanSliceIntIntMethod);
+ Stloc(spanLocal);
+
+ Ldc(0);
+ Stloc(iterationLocal);
+ BrFar(conditionLabel);
+
+ MarkLabel(bodyLabel);
+ EmitTimeoutCheck();
+
+ LocalBuilder tmpTextSpanLocal = textSpanLocal; // save the real span local; EmitSingleChar below must read from the sliced temporary instead
+ int tmpTextSpanPos = textSpanPos;
+ textSpanLocal = spanLocal;
+ textSpanPos = 0;
+ EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal);
+ textSpanLocal = tmpTextSpanLocal; // restore the real span local and static position for the code emitted after this loop
+ textSpanPos = tmpTextSpanPos;
+
+ Ldloc(iterationLocal);
+ Ldc(1);
+ Add();
+ Stloc(iterationLocal);
+
+ MarkLabel(conditionLabel);
+ Ldloc(iterationLocal);
+ Ldloca(spanLocal);
+ Call(s_spanGetLengthMethod);
+ BltFar(bodyLabel);
+
+ ReturnReadOnlySpanCharLocal(spanLocal);
+ ReturnInt32Local(iterationLocal);
+
+ textSpanPos += iterations;
+ }
+ }
+
+ // Emits the code to handle a loop (repeater) around a child node with a fixed number of iterations (node.M).
+ // This is used both to handle the case of A{5, 5} where the min and max are equal,
+ // and also to handle part of the case of A{3, 5}, where this method is called to
+ // handle the A{3, 3} portion, and then remaining A{0, 2} is handled separately.
+ void EmitNodeRepeater(RegexNode node)
+ {
+ int iterations = node.M;
+ Debug.Assert(iterations > 0);
+
+ if (iterations == 1)
+ {
+ Debug.Assert(node.ChildCount() == 1);
+ EmitNode(node.Child(0)); // a single iteration needs no loop: just emit the child inline
+ return;
+ }
+
+ // Ensure textSpanPos is 0 prior to emitting the child.
+ TransferTextSpanPosToRunTextPos();
+
+ // for (int i = 0; i < iterations; i++)
+ // {
+ // TimeoutCheck();
+ // <code emitted for node.Child(0); a failed match branches to doneLabel>
+ // }
+
+ Label conditionLabel = DefineLabel();
+ Label bodyLabel = DefineLabel();
+ LocalBuilder iterationLocal = RentInt32Local();
+
+ Ldc(0);
+ Stloc(iterationLocal);
+ BrFar(conditionLabel);
+
+ MarkLabel(bodyLabel);
+ EmitTimeoutCheck();
+
+ Debug.Assert(node.ChildCount() == 1);
+ Debug.Assert(textSpanPos == 0);
+ EmitNode(node.Child(0));
+ TransferTextSpanPosToRunTextPos(); // re-establish textSpanPos == 0 so every iteration of the emitted loop sees the same static offset
+
+ Ldloc(iterationLocal);
+ Ldc(1);
+ Add();
+ Stloc(iterationLocal);
+
+ MarkLabel(conditionLabel);
+ Ldloc(iterationLocal);
+ Ldc(iterations);
+ BltFar(bodyLabel);
+
+ ReturnInt32Local(iterationLocal);
+ }
+
+ // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison (one / notone / set).
+ void EmitAtomicSingleCharLoop(RegexNode node)
+ {
+ Debug.Assert(
+ node.Type == RegexNode.Oneloopatomic ||
+ node.Type == RegexNode.Notoneloopatomic ||
+ node.Type == RegexNode.Setloopatomic);
+ Debug.Assert(node.M < int.MaxValue);
+
+ // A fixed iteration count (M == N) needs no variable-length logic; emit it as a simple repeater.
+ if (node.M == node.N)
+ {
+ EmitSingleCharRepeater(node);
+ return;
+ }
+
+ Debug.Assert(node.N > node.M);
+ int minIterations = node.M;
+ int maxIterations = node.N;
+
+ LocalBuilder iterationLocal = RentInt32Local();
+
+ Label originalDoneLabel = doneLabel;
+ doneLabel = DefineLabel(); // temporary target: a failed iteration only ends the loop; the minimum count is checked after it
+
+ if (node.Type == RegexNode.Notoneloopatomic && maxIterations == int.MaxValue && !IsCaseInsensitive(node))
+ {
+ // For Notoneloopatomic, we're looking for a specific character, as everything until we find
+ // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive,
+ // we can use the vectorized IndexOf to do the search, rather than open-coding it. (In the future,
+ // we could consider using IndexOf with StringComparison for case insensitivity.)
+
+ // int i = textSpan.Slice(textSpanPos).IndexOf(char);
+ if (textSpanPos > 0)
+ {
+ Ldloca(textSpanLocal);
+ Ldc(textSpanPos);
+ Call(s_spanSliceIntMethod);
+ }
+ else
+ {
+ Ldloc(textSpanLocal);
+ }
+ Ldc(node.Ch);
+ Call(s_spanIndexOf);
+ Stloc(iterationLocal);
+
+ // if (i != -1) goto doneLabel; (found: i is the number of characters the loop consumed)
+ Ldloc(iterationLocal);
+ Ldc(-1);
+ BneFar(doneLabel);
- Ldloc(cLocal);
- Ldc(1);
- Sub();
- Stloc(cLocal);
+ // i = textSpan.Length - textSpanPos; (not found: the loop consumes the rest of the span)
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ if (textSpanPos > 0)
+ {
+ Ldc(textSpanPos);
+ Sub();
+ }
+ Stloc(iterationLocal);
+ }
+ else
+ {
+ // For everything else, do a normal loop.
- Leftcharnext();
+ // Transfer text pos to runtextpos to help with bounds check elimination on the loop.
+ TransferTextSpanPosToRunTextPos();
- EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
- BrtrueFar(l2);
+ Label conditionLabel = DefineLabel();
+ Label bodyLabel = DefineLabel();
- MarkLabel(l5);
+ // int i = 0;
+ Ldc(0);
+ Stloc(iterationLocal);
+ BrFar(conditionLabel);
+
+ // Body:
+ // TimeoutCheck();
+ MarkLabel(bodyLabel);
+ EmitTimeoutCheck();
+
+ // if ((uint)i >= (uint)textSpan.Length) goto doneLabel;
+ Ldloc(iterationLocal);
+ Ldloca(textSpanLocal);
+ Call(s_spanGetLengthMethod);
+ BgeUnFar(doneLabel);
+
+ // if (textSpan[i] does not satisfy the node's one / notone / set test) goto doneLabel;
+ Ldloca(textSpanLocal);
+ Ldloc(iterationLocal);
+ Call(s_spanGetItemMethod);
+ LdindU2();
+ switch (node.Type)
+ {
+ case RegexNode.Oneloopatomic:
+ if (IsCaseInsensitive(node)) CallToLower();
+ Ldc(node.Ch);
+ BneFar(doneLabel);
+ break;
+ case RegexNode.Notoneloopatomic:
+ if (IsCaseInsensitive(node)) CallToLower();
+ Ldc(node.Ch);
+ BeqFar(doneLabel);
+ break;
+ case RegexNode.Setloopatomic:
+ LocalBuilder setScratchLocal = RentInt32Local();
+ EmitCallCharInClass(node.Str!, IsCaseInsensitive(node), setScratchLocal);
+ ReturnInt32Local(setScratchLocal);
+ BrfalseFar(doneLabel);
+ break;
+ }
- Ldloc(cLocal);
- Ldc(0);
- BgtFar(l1);
+ // i++;
+ Ldloc(iterationLocal);
+ Ldc(1);
+ Add();
+ Stloc(iterationLocal);
- Ldc(0);
- BrFar(l3);
+ // if (i < maxIterations) goto Body; (an unbounded loop always branches back and exits only via the checks in the body)
+ MarkLabel(conditionLabel);
+ if (maxIterations != int.MaxValue)
+ {
+ Ldloc(iterationLocal);
+ Ldc(maxIterations);
+ BltFar(bodyLabel);
+ }
+ else
+ {
+ BrFar(bodyLabel);
+ }
+ }
- MarkLabel(l2);
+ // Done:
+ MarkLabel(doneLabel);
+ doneLabel = originalDoneLabel; // Restore the original done label
- Ldloc(_runtextposLocal);
- Ldc(1);
- Sub(_code.RightToLeft);
- Stloc(_runtextposLocal);
- Ldc(1);
+ // Check to ensure we've found at least min iterations.
+ if (minIterations > 0)
+ {
+ Ldloc(iterationLocal);
+ Ldc(minIterations);
+ BltFar(doneLabel); // doneLabel is the original (outer) label again here
+ }
- MarkLabel(l3);
+ // Now that we've completed our optional iterations, advance the text span
+ // and runtextpos by the number of iterations completed.
- Mvlocfld(_runtextposLocal, s_runtextposField);
- Ret();
+ // textSpan = textSpan.Slice(i);
+ Ldloca(textSpanLocal);
+ Ldloc(iterationLocal);
+ Call(s_spanSliceIntMethod);
+ Stloc(textSpanLocal);
- MarkLabel(l4);
- Ldc(0);
- Ret();
+ // runtextpos += i;
+ Ldloc(runtextposLocal);
+ Ldloc(iterationLocal);
+ Add();
+ Stloc(runtextposLocal);
+
+ ReturnInt32Local(iterationLocal);
}
- else // for left-to-right, use span to avoid bounds checks when doing normal forward iteration recognized by the JIT
- {
- LocalBuilder charInClassLocal = _temp1Local;
- LocalBuilder iLocal = _temp2Local;
- _temp3Local = DeclareReadOnlySpanChar();
- LocalBuilder textSpanLocal = _temp3Local;
- Label returnFalseLabel = DefineLabel();
- Label checkSpanLengthLabel = DefineLabel();
- Label loopBody = DefineLabel();
- Label charNotInClassLabel = DefineLabel();
+ // Emits the code to handle a non-backtracking, variable-length loop around another (child) node.
+ void EmitAtomicNodeLoop(RegexNode node)
+ {
+ Debug.Assert(node.Type == RegexNode.Loop);
+ Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
+ Debug.Assert(node.M < int.MaxValue);
- // string runtext = this.runtext
- Mvfldloc(s_runtextField, _runtextLocal);
+ if (node.M == node.N)
+ {
+ EmitNodeRepeater(node); // fixed iteration count: no variable-length logic needed
+ return;
+ }
- // if (runtextend - runtextpos > 0)
- Ldthisfld(s_runtextendField);
- Ldthisfld(s_runtextposField);
- Sub();
- Ldc(0);
- BleFar(returnFalseLabel);
+ LocalBuilder iterationLocal = RentInt32Local();
- // ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- Ldloc(_runtextLocal);
- Ldthisfld(s_runtextposField);
- Ldthisfld(s_runtextendField);
- Ldthisfld(s_runtextposField);
- Sub();
- Call(s_stringAsSpanMethod);
- Stloc(textSpanLocal);
+ Label originalDoneLabel = doneLabel;
+ doneLabel = DefineLabel(); // loop-exit label: reached when an iteration fails or the loop finishes; restored after the loop
- // for (int i = 0;
- Ldc(0);
- Stloc(iLocal);
- BrFar(checkSpanLengthLabel);
+ // We might loop any number of times. In order to ensure this loop
+ // and subsequent code sees textSpanPos the same regardless, we always need it to contain
+ // the same value, and the easiest such value is 0. So, we transfer
+ // textSpanPos to runtextpos, and ensure that any path out of here has
+ // textSpanPos as 0.
+ TransferTextSpanPosToRunTextPos();
- // if (CharInClass(span[i], "..."))
- MarkLabel(loopBody);
- Ldloca(textSpanLocal);
- Ldloc(iLocal);
- Call(s_spanGetItemMethod);
- LdindU2();
- EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
- BrfalseFar(charNotInClassLabel);
+ Label conditionLabel = DefineLabel();
+ Label bodyLabel = DefineLabel();
- // runtextpos += i; return true;
- Ldthis();
- Ldthisfld(s_runtextposField);
- Ldloc(iLocal);
- Add();
- Stfld(s_runtextposField);
- Ldc(1);
- Ret();
+ Debug.Assert(node.N > node.M);
+ int minIterations = node.M;
+ int maxIterations = node.N;
- // for (...; ...; i++)
- MarkLabel(charNotInClassLabel);
- Ldloc(iLocal);
+ // int i = 0;
+ Ldc(0);
+ Stloc(iterationLocal);
+ BrFar(conditionLabel);
+
+ // Body:
+ // TimeoutCheck();
+ // if (!match) goto Done;
+ MarkLabel(bodyLabel);
+ EmitTimeoutCheck();
+
+ // Iteration body
+ Label successfulIterationLabel = DefineLabel();
+
+ Label prevDone = doneLabel;
+ doneLabel = DefineLabel(); // per-iteration failure label: the child's failure paths land on the reset code below
+
+ // Save off runtextpos.
+ LocalBuilder startingRunTextPosLocal = RentInt32Local();
+ Ldloc(runtextposLocal);
+ Stloc(startingRunTextPosLocal);
+
+ // Emit the child.
+ Debug.Assert(textSpanPos == 0);
+ EmitNode(node.Child(0));
+ TransferTextSpanPosToRunTextPos(); // ensure textSpanPos remains 0
+ Br(successfulIterationLabel); // iteration succeeded
+
+ // If the generated code gets here, the iteration failed.
+ // Reset state, branch to done.
+ MarkLabel(doneLabel);
+ doneLabel = prevDone; // reset done label
+ Ldloc(startingRunTextPosLocal);
+ Stloc(runtextposLocal);
+ ReturnInt32Local(startingRunTextPosLocal);
+ BrFar(doneLabel); // doneLabel is the loop-exit label again: a failed iteration ends the loop
+
+ // Successful iteration.
+ MarkLabel(successfulIterationLabel);
+
+ // i++;
+ Ldloc(iterationLocal);
Ldc(1);
Add();
- Stloc(iLocal);
+ Stloc(iterationLocal);
- // for (...; i < span.Length; ...);
- MarkLabel(checkSpanLengthLabel);
- Ldloc(iLocal);
- Ldloca(textSpanLocal);
- Call(s_spanGetLengthMethod);
- BltFar(loopBody);
+ // if (i < maxIterations) goto Body; (an unbounded loop always branches back and exits only when an iteration fails)
+ MarkLabel(conditionLabel);
+ if (maxIterations != int.MaxValue)
+ {
+ Ldloc(iterationLocal);
+ Ldc(maxIterations);
+ BltFar(bodyLabel);
+ }
+ else
+ {
+ BrFar(bodyLabel);
+ }
- // runtextpos = runtextend;
- Ldthis();
- Ldthisfld(s_runtextendField);
- Stfld(s_runtextposField);
+ // Done:
+ MarkLabel(doneLabel);
+ doneLabel = originalDoneLabel; // Restore the original done label
- // return false;
- MarkLabel(returnFalseLabel);
- Ldc(0);
- Ret();
- }
- }
+ // Check to ensure we've found at least min iterations.
+ if (minIterations > 0)
+ {
+ Ldloc(iterationLocal);
+ Ldc(minIterations);
+ BltFar(doneLabel); // too few iterations: fail out to the original (outer) done label
+ }
- /// Generates a very simple method that sets the _trackcount field.
- protected void GenerateInitTrackCount()
- {
- Ldthis();
- Ldc(_trackcount);
- Stfld(s_runtrackcountField);
- Ret();
+ ReturnInt32Local(iterationLocal);
+ }
}
- /// Declares a local int.
- private LocalBuilder DeclareInt() => _ilg!.DeclareLocal(typeof(int));
-
- /// Declares a local CultureInfo.
- private LocalBuilder? DeclareCultureInfo() => _ilg!.DeclareLocal(typeof(CultureInfo)); // cache local variable to avoid unnecessary TLS
-
- /// Declares a local int[].
- private LocalBuilder DeclareIntArray() => _ilg!.DeclareLocal(typeof(int[]));
-
- /// Declares a local string.
- private LocalBuilder DeclareString() => _ilg!.DeclareLocal(typeof(string));
-
- private LocalBuilder DeclareReadOnlySpanChar() => _ilg!.DeclareLocal(typeof(ReadOnlySpan));
-
/// Generates the code for "RegexRunner.Go".
protected void GenerateGo()
{
+ // Generate backtrack-free code when we're dealing with simpler regexes.
+ if (TryGenerateNonBacktrackingGo(_code!.Tree.Root))
+ {
+ return;
+ }
+
+ // We're dealing with a regex more complicated than the fast-path non-backtracking
+ // implementation can handle. Do the full-fledged thing.
+
// declare some locals
- _runtextposLocal = DeclareInt();
+ _runtextposLocal = DeclareInt32();
_runtextLocal = DeclareString();
- _runtrackposLocal = DeclareInt();
- _runtrackLocal = DeclareIntArray();
- _runstackposLocal = DeclareInt();
- _runstackLocal = DeclareIntArray();
- _temp1Local = DeclareInt();
- _temp2Local = DeclareInt();
- _temp3Local = DeclareInt();
+ _runtrackposLocal = DeclareInt32();
+ _runtrackLocal = DeclareInt32Array();
+ _runstackposLocal = DeclareInt32();
+ _runstackLocal = DeclareInt32Array();
+ _temp1Local = DeclareInt32();
+ _temp2Local = DeclareInt32();
+ _temp3Local = DeclareInt32();
if (_hasTimeout)
{
- _loopTimeoutCounterLocal = DeclareInt();
+ _loopTimeoutCounterLocal = DeclareInt32();
}
- _runtextbegLocal = DeclareInt();
- _runtextendLocal = DeclareInt();
- _runtextstartLocal = DeclareInt();
-
- _cultureLocal = null;
- if (!_options.HasFlag(RegexOptions.CultureInvariant))
- {
- bool needsCulture = _options.HasFlag(RegexOptions.IgnoreCase);
- if (!needsCulture)
- {
- for (int codepos = 0; codepos < _codes!.Length; codepos += RegexCode.OpcodeSize(_codes[codepos]))
- {
- if ((_codes[codepos] & RegexCode.Ci) == RegexCode.Ci)
- {
- needsCulture = true;
- break;
- }
- }
- }
+ _runtextbegLocal = DeclareInt32();
+ _runtextendLocal = DeclareInt32();
- if (needsCulture)
- {
- _cultureLocal = DeclareCultureInfo();
- }
- }
+ InitializeCultureForGoIfNecessary();
// clear some tables
@@ -1320,29 +2605,37 @@ protected void GenerateGo()
// emit the code!
- // cache CultureInfo in local variable which saves excessive thread local storage accesses
- if (_cultureLocal != null)
- {
- InitLocalCultureInfo();
- }
-
GenerateForwardSection();
GenerateMiddleSection();
GenerateBacktrackSection();
}
-#if DEBUG
- /// Debug.WriteLine
- private static readonly MethodInfo? s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) });
-
- /// Debug only: emit code to print out a message.
- private void Message(string str)
+ private void InitializeCultureForGoIfNecessary() // declares and initializes _cultureLocal only when the regex needs culture-sensitive case-insensitive matching
{
- Ldstr(str);
- Call(s_debugWriteLine!);
- }
+ _cultureLocal = null; // stays null unless the checks below find that a culture is needed
+ if ((_options & RegexOptions.CultureInvariant) == 0)
+ {
+ bool needsCulture = (_options & RegexOptions.IgnoreCase) != 0;
+ if (!needsCulture)
+ {
+ for (int codepos = 0; codepos < _codes!.Length; codepos += RegexCode.OpcodeSize(_codes[codepos])) // scan each opcode for the case-insensitive (Ci) flag
+ {
+ if ((_codes[codepos] & RegexCode.Ci) == RegexCode.Ci)
+ {
+ needsCulture = true;
+ break;
+ }
+ }
+ }
-#endif
+ if (needsCulture)
+ {
+ // cache CultureInfo in local variable which saves excessive thread local storage accesses
+ _cultureLocal = DeclareCultureInfo();
+ InitLocalCultureInfo();
+ }
+ }
+ }
///
/// The main translation function. It translates the logic for a single opcode at
@@ -1361,37 +2654,9 @@ private void GenerateOneCode()
{
#if DEBUG
if ((_options & RegexOptions.Debug) != 0)
- {
- Mvlocfld(_runtextposLocal!, s_runtextposField);
- Mvlocfld(_runtrackposLocal!, s_runtrackposField);
- Mvlocfld(_runstackposLocal!, s_runstackposField);
- Ldthis();
- Callvirt(s_dumpStateM);
-
- var sb = new StringBuilder();
- if (_backpos > 0)
- {
- sb.AppendFormat("{0:D6} ", _backpos);
- }
- else
- {
- sb.Append(" ");
- }
- sb.Append(_code!.OpcodeDescription(_codepos));
-
- if ((_regexopcode & RegexCode.Back) != 0)
- {
- sb.Append(" Back");
- }
-
- if ((_regexopcode & RegexCode.Back2) != 0)
- {
- sb.Append(" Back2");
- }
-
- Message(sb.ToString());
- }
+ DumpBacktracking();
#endif
+
LocalBuilder charInClassLocal;
// Before executing any RegEx code in the unrolled loop,
@@ -2241,10 +3506,7 @@ private void GenerateOneCode()
//: if (Str[i] != Rightcharnext())
//: break Backward;
{
- int i;
- string str;
-
- str = _strings![Operand(0)];
+ string str = _strings![Operand(0)];
Ldc(str.Length);
Ldloc(_runtextendLocal!);
@@ -2253,7 +3515,7 @@ private void GenerateOneCode()
BgtFar(_backtrack);
// unroll the string
- for (i = 0; i < str.Length; i++)
+ for (int i = 0; i < str.Length; i++)
{
Ldloc(_runtextLocal!);
Ldloc(_runtextposLocal!);
@@ -2289,10 +3551,7 @@ private void GenerateOneCode()
//: if (Str[--c] != Leftcharnext())
//: break Backward;
{
- int i;
- string str;
-
- str = _strings![Operand(0)];
+ string str = _strings![Operand(0)];
Ldc(str.Length);
Ldloc(_runtextposLocal!);
@@ -2301,7 +3560,7 @@ private void GenerateOneCode()
BgtFar(_backtrack);
// unroll the string
- for (i = str.Length; i > 0;)
+ for (int i = str.Length; i > 0;)
{
i--;
Ldloc(_runtextLocal!);
@@ -2509,10 +3768,7 @@ private void GenerateOneCode()
if (Code() == RegexCode.Setrep)
{
- if (_hasTimeout)
- {
- EmitTimeoutCheck();
- }
+ EmitTimeoutCheck();
EmitCallCharInClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
BrfalseFar(_backtrack);
}
@@ -2558,6 +3814,18 @@ private void GenerateOneCode()
case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Rtl:
case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl:
case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl:
+ case RegexCode.Oneloopatomic:
+ case RegexCode.Notoneloopatomic:
+ case RegexCode.Setloopatomic:
+ case RegexCode.Oneloopatomic | RegexCode.Rtl:
+ case RegexCode.Notoneloopatomic | RegexCode.Rtl:
+ case RegexCode.Setloopatomic | RegexCode.Rtl:
+ case RegexCode.Oneloopatomic | RegexCode.Ci:
+ case RegexCode.Notoneloopatomic | RegexCode.Ci:
+ case RegexCode.Setloopatomic | RegexCode.Ci:
+ case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl:
+ case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl:
+ case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl:
//: int c = Operand(1);
//: if (c > Rightchars())
//: c = Rightchars();
@@ -2620,7 +3888,7 @@ private void GenerateOneCode()
Dup();
Stloc(cLocal);
Ldc(0);
- if (Code() == RegexCode.Setloop)
+ if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
{
BleFar(l2);
}
@@ -2638,12 +3906,9 @@ private void GenerateOneCode()
Rightcharnext();
}
- if (Code() == RegexCode.Setloop)
+ if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
{
- if (_hasTimeout)
- {
- EmitTimeoutCheck();
- }
+ EmitTimeoutCheck();
EmitCallCharInClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
BrtrueFar(l1);
}
@@ -2655,12 +3920,13 @@ private void GenerateOneCode()
}
Ldc(Operand(0));
- if (Code() == RegexCode.Oneloop)
+ if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
{
Beq(l1);
}
else
{
+ Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
Bne(l1);
}
}
@@ -2671,25 +3937,29 @@ private void GenerateOneCode()
Stloc(_runtextposLocal!);
MarkLabel(l2);
- Ldloc(lenLocal);
- Ldloc(cLocal);
- Ble(AdvanceLabel());
- ReadyPushTrack();
- Ldloc(lenLocal);
- Ldloc(cLocal);
- Sub();
- Ldc(1);
- Sub();
- DoPush();
+ if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic)
+ {
+ Ldloc(lenLocal);
+ Ldloc(cLocal);
+ Ble(AdvanceLabel());
- ReadyPushTrack();
- Ldloc(_runtextposLocal!);
- Ldc(1);
- Sub(IsRightToLeft());
- DoPush();
+ ReadyPushTrack();
+ Ldloc(lenLocal);
+ Ldloc(cLocal);
+ Sub();
+ Ldc(1);
+ Sub();
+ DoPush();
- Track();
+ ReadyPushTrack();
+ Ldloc(_runtextposLocal!);
+ Ldc(1);
+ Sub(IsRightToLeft());
+ DoPush();
+
+ Track();
+ }
break;
}
@@ -2937,11 +4207,20 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu
charClass[RegexCharClass.CategoryLengthIndex] == 0 && // must not have any categories
charClass[RegexCharClass.SetStartIndex] < charClass[RegexCharClass.SetStartIndex + 1]) // valid range
{
- // (uint)ch - charClass[3] < charClass[4] - charClass[3]
- Ldc(charClass[RegexCharClass.SetStartIndex]);
- Sub();
- Ldc(charClass[RegexCharClass.SetStartIndex + 1] - charClass[RegexCharClass.SetStartIndex]);
- CltUn();
+ if (RegexCharClass.IsSingleton(charClass) || RegexCharClass.IsSingletonInverse(charClass))
+ {
+ // ch == charClass[3]
+ Ldc(charClass[3]);
+ Ceq();
+ }
+ else
+ {
+ // (uint)ch - charClass[3] < charClass[4] - charClass[3]
+ Ldc(charClass[RegexCharClass.SetStartIndex]);
+ Sub();
+ Ldc(charClass[RegexCharClass.SetStartIndex + 1] - charClass[RegexCharClass.SetStartIndex]);
+ CltUn();
+ }
// Negate the answer if the negation flag was set
if (RegexCharClass.IsNegated(charClass))
@@ -2967,13 +4246,15 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu
// endianness perspective because the compilation happens on the same machine
// that runs the compiled code. If that were to ever change, this would need
// to be revisited. String length is 8 chars == 16 bytes == 128 bits.
- string bitVectorString = string.Create(8, charClass, (dest, charClass) =>
+ string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) =>
{
for (int i = 0; i < 128; i++)
{
char c = (char)i;
- if (RegexCharClass.CharInClass(c, charClass) ||
- (invariant && char.IsUpper(c) && RegexCharClass.CharInClass(char.ToLowerInvariant(c), charClass)))
+ bool isSet = state.invariant ?
+ RegexCharClass.CharInClass(char.ToLowerInvariant(c), state.charClass) :
+ RegexCharClass.CharInClass(c, state.charClass);
+ if (isSet)
{
dest[i >> 4] |= (char)(1 << (i & 0xF));
}
@@ -2988,12 +4269,7 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu
// 3. Evaluate CharInClass on all ~65K inputs. This is relatively expensive, impacting startup costs.
// We currently go with (2). We may sometimes generate a fallback when we don't need one, but the cost of
// doing so once in a while is minimal.
- bool asciiOnly =
- charClass.Length > RegexCharClass.SetStartIndex &&
- charClass[RegexCharClass.CategoryLengthIndex] == 0 && // if there are any categories, assume there's unicode
- charClass[RegexCharClass.SetLengthIndex] % 2 == 0 && // range limits must come in pairs
- !RegexCharClass.IsNegated(charClass) && // if there's negation, assume there's unicode
- !RegexCharClass.IsSubtraction(charClass); // if it's subtraction, assume there's unicode
+ bool asciiOnly = RegexCharClass.CanEasilyEnumerateSetContents(charClass);
if (asciiOnly)
{
for (int i = RegexCharClass.SetStartIndex; i < charClass.Length; i++)
@@ -3062,7 +4338,12 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu
/// Emits a timeout check.
private void EmitTimeoutCheck()
{
- Debug.Assert(_hasTimeout && _loopTimeoutCounterLocal != null);
+ if (!_hasTimeout)
+ {
+ return;
+ }
+
+ Debug.Assert(_loopTimeoutCounterLocal != null);
// Increment counter for each loop iteration.
Ldloc(_loopTimeoutCounterLocal);
@@ -3080,5 +4361,42 @@ private void EmitTimeoutCheck()
Callvirt(s_checkTimeoutMethod);
MarkLabel(label);
}
+
+#if DEBUG
+ /// Debug only: emits code that calls the runner's state dump and then writes a description of the current opcode via s_debugWriteLine.
+ [ExcludeFromCodeCoverage]
+ private void DumpBacktracking()
+ {
+ Mvlocfld(_runtextposLocal!, s_runtextposField); // presumably copies the cached locals into the runner fields so the dump sees current values -- confirm
+ Mvlocfld(_runtrackposLocal!, s_runtrackposField);
+ Mvlocfld(_runstackposLocal!, s_runstackposField);
+ Ldthis();
+ Callvirt(s_dumpStateM);
+
+ var sb = new StringBuilder(); // the description is built at emit time and embedded in the IL as a string literal
+ if (_backpos > 0)
+ {
+ sb.AppendFormat("{0:D6} ", _backpos);
+ }
+ else
+ {
+ sb.Append(" ");
+ }
+ sb.Append(_code!.OpcodeDescription(_codepos));
+
+ if ((_regexopcode & RegexCode.Back) != 0)
+ {
+ sb.Append(" Back");
+ }
+
+ if ((_regexopcode & RegexCode.Back2) != 0)
+ {
+ sb.Append(" Back2");
+ }
+
+ Ldstr(sb.ToString());
+ Call(s_debugWriteLine!);
+ }
+#endif
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
index b69a6e8c41934..ff958a4e151ad 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
@@ -13,6 +13,7 @@
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -84,7 +85,7 @@ public static RegexPrefix Prefix(RegexTree tree)
while (true)
{
- switch (curNode.NType)
+ switch (curNode.Type)
{
case RegexNode.Concatenate:
if (curNode.ChildCount() > 0)
@@ -94,13 +95,14 @@ public static RegexPrefix Prefix(RegexTree tree)
}
break;
- case RegexNode.Greedy:
+ case RegexNode.Atomic:
case RegexNode.Capture:
curNode = curNode.Child(0);
concatNode = null;
continue;
case RegexNode.Oneloop:
+ case RegexNode.Oneloopatomic:
case RegexNode.Onelazy:
// In release, cutoff at a length to which we can still reasonably construct a string
@@ -165,7 +167,7 @@ public static int Anchors(RegexTree tree)
while (true)
{
- switch (curNode.NType)
+ switch (curNode.Type)
{
case RegexNode.Concatenate:
if (curNode.ChildCount() > 0)
@@ -175,7 +177,7 @@ public static int Anchors(RegexTree tree)
}
break;
- case RegexNode.Greedy:
+ case RegexNode.Atomic:
case RegexNode.Capture:
curNode = curNode.Child(0);
concatNode = null;
@@ -189,7 +191,7 @@ public static int Anchors(RegexTree tree)
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.End:
- return result | AnchorFromType(curNode.NType);
+ return result | AnchorFromType(curNode.Type);
case RegexNode.Empty:
case RegexNode.Require:
@@ -225,6 +227,7 @@ private static int AnchorFromType(int type) =>
};
#if DEBUG
+ [ExcludeFromCodeCoverage]
public static string AnchorDescription(int anchors)
{
StringBuilder sb = new StringBuilder();
@@ -295,19 +298,20 @@ private RegexFC PopFC()
while (true)
{
- if (curNode.Children == null)
+ int curNodeChildCount = curNode.ChildCount();
+ if (curNodeChildCount == 0)
{
// This is a leaf node
- CalculateFC(curNode.NType, curNode, 0);
+ CalculateFC(curNode.Type, curNode, 0);
}
- else if (curChild < curNode.Children.Count && !_skipAllChildren)
+ else if (curChild < curNodeChildCount && !_skipAllChildren)
{
// This is an interior node, and we have more children to analyze
- CalculateFC(curNode.NType | BeforeChild, curNode, curChild);
+ CalculateFC(curNode.Type | BeforeChild, curNode, curChild);
if (!_skipchild)
{
- curNode = curNode.Children[curChild];
+ curNode = curNode.Child(curChild);
// this stack is how we get a depth first walk of the tree.
PushInt(curChild);
curChild = 0;
@@ -330,7 +334,7 @@ private RegexFC PopFC()
curChild = PopInt();
curNode = curNode.Next;
- CalculateFC(curNode!.NType | AfterChild, curNode, curChild);
+ CalculateFC(curNode!.Type | AfterChild, curNode, curChild);
if (_failed)
return null;
@@ -353,16 +357,8 @@ private RegexFC PopFC()
///
private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
{
- bool ci = false;
- bool rtl = false;
-
- if (NodeType <= RegexNode.Ref)
- {
- if ((node.Options & RegexOptions.IgnoreCase) != 0)
- ci = true;
- if ((node.Options & RegexOptions.RightToLeft) != 0)
- rtl = true;
- }
+ bool ci = (node.Options & RegexOptions.IgnoreCase) != 0;
+ bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
switch (NodeType)
{
@@ -426,8 +422,8 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
case RegexNode.Group | AfterChild:
case RegexNode.Capture | BeforeChild:
case RegexNode.Capture | AfterChild:
- case RegexNode.Greedy | BeforeChild:
- case RegexNode.Greedy | AfterChild:
+ case RegexNode.Atomic | BeforeChild:
+ case RegexNode.Atomic | AfterChild:
break;
case RegexNode.Require | BeforeChild:
@@ -446,11 +442,13 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
break;
case RegexNode.Oneloop:
+ case RegexNode.Oneloopatomic:
case RegexNode.Onelazy:
PushFC(new RegexFC(node.Ch, false, node.M == 0, ci));
break;
case RegexNode.Notoneloop:
+ case RegexNode.Notoneloopatomic:
case RegexNode.Notonelazy:
PushFC(new RegexFC(node.Ch, true, node.M == 0, ci));
break;
@@ -469,6 +467,7 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
break;
case RegexNode.Setloop:
+ case RegexNode.Setloopatomic:
case RegexNode.Setlazy:
PushFC(new RegexFC(node.Str!, node.M == 0, ci));
break;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
index bcbd0ad812f3d..a02da76c76121 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
@@ -6,6 +6,7 @@
// while consuming input.
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.CompilerServices;
@@ -451,71 +452,138 @@ protected override bool FindFirstChar()
// We now loop through looking for the first matching character. This is a hot loop, so we lift out as many
// branches as we can. Each operation requires knowing whether this is a) right-to-left vs left-to-right, and
- // b) case-sensitive vs case-insensitive. So, we split it all out into 4 loops, for each combination of these.
- // It's duplicated code, but it allows the inner loop to be much tighter than if everything were combined with
- // multiple branches on each operation. We can also then use spans to avoid bounds checks in at least the forward
- // iteration direction where the JIT is able to detect the pattern.
+ // b) case-sensitive vs case-insensitive, and c) a singleton or not. So, we split it all out into 8 loops, for
+ // each combination of these. It's duplicated code, but it allows the inner loop to be much tighter than if
+ // everything were combined with multiple branches on each operation. We can also then use spans to avoid bounds
+ // checks in at least the forward iteration direction where the JIT is able to detect the pattern.
- if (!_rightToLeft)
+ if (RegexCharClass.IsSingleton(set))
{
- ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- if (!_caseInsensitive)
+ char ch = RegexCharClass.SingletonChar(set);
+
+ if (!_rightToLeft)
{
- // left-to-right, case-sensitive
- for (int i = 0; i < span.Length; i++)
+ ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
+ if (!_caseInsensitive)
{
- if (RegexCharClass.CharInClass(span[i], set, ref _code.FCPrefixAsciiLookup))
+ // singleton, left-to-right, case-sensitive
+ int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(ch);
+ if (i >= 0)
{
runtextpos += i;
return true;
}
}
+ else
+ {
+ // singleton, left-to-right, case-insensitive
+ TextInfo ti = _culture.TextInfo;
+ for (int i = 0; i < span.Length; i++)
+ {
+ if (ch == ti.ToLower(span[i]))
+ {
+ runtextpos += i;
+ return true;
+ }
+ }
+ }
+
+ runtextpos = runtextend;
}
else
{
- // left-to-right, case-insensitive
- TextInfo ti = _culture.TextInfo;
- for (int i = 0; i < span.Length; i++)
+ if (!_caseInsensitive)
{
- if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.FCPrefixAsciiLookup))
+ // singleton, right-to-left, case-sensitive
+ for (int i = runtextpos - 1; i >= runtextbeg; i--)
{
- runtextpos += i;
- return true;
+ if (ch == runtext![i])
+ {
+ runtextpos = i + 1;
+ return true;
+ }
+ }
+ }
+ else
+ {
+ // singleton, right-to-left, case-insensitive
+ TextInfo ti = _culture.TextInfo;
+ for (int i = runtextpos - 1; i >= runtextbeg; i--)
+ {
+ if (ch == ti.ToLower(runtext![i]))
+ {
+ runtextpos = i + 1;
+ return true;
+ }
}
}
- }
- runtextpos = runtextend;
+ runtextpos = runtextbeg;
+ }
}
else
{
- if (!_caseInsensitive)
+ if (!_rightToLeft)
{
- // right-to-left, case-sensitive
- for (int i = runtextpos - 1; i >= runtextbeg; i--)
+ ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
+ if (!_caseInsensitive)
{
- if (RegexCharClass.CharInClass(runtext![i], set, ref _code.FCPrefixAsciiLookup))
+ // set, left-to-right, case-sensitive
+ for (int i = 0; i < span.Length; i++)
{
- runtextpos = i + 1;
- return true;
+ if (RegexCharClass.CharInClass(span[i], set, ref _code.FCPrefixAsciiLookup))
+ {
+ runtextpos += i;
+ return true;
+ }
}
}
+ else
+ {
+ // set, left-to-right, case-insensitive
+ TextInfo ti = _culture.TextInfo;
+ for (int i = 0; i < span.Length; i++)
+ {
+ if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.FCPrefixAsciiLookup))
+ {
+ runtextpos += i;
+ return true;
+ }
+ }
+ }
+
+ runtextpos = runtextend;
}
else
{
- // right-to-left, case-insensitive
- TextInfo ti = _culture.TextInfo;
- for (int i = runtextpos - 1; i >= runtextbeg; i--)
+ if (!_caseInsensitive)
{
- if (RegexCharClass.CharInClass(ti.ToLower(runtext![i]), set, ref _code.FCPrefixAsciiLookup))
+ // set, right-to-left, case-sensitive
+ for (int i = runtextpos - 1; i >= runtextbeg; i--)
{
- runtextpos = i + 1;
- return true;
+ if (RegexCharClass.CharInClass(runtext![i], set, ref _code.FCPrefixAsciiLookup))
+ {
+ runtextpos = i + 1;
+ return true;
+ }
+ }
+ }
+ else
+ {
+ // set, right-to-left, case-insensitive
+ TextInfo ti = _culture.TextInfo;
+ for (int i = runtextpos - 1; i >= runtextbeg; i--)
+ {
+ if (RegexCharClass.CharInClass(ti.ToLower(runtext![i]), set, ref _code.FCPrefixAsciiLookup))
+ {
+ runtextpos = i + 1;
+ return true;
+ }
}
}
- }
- runtextpos = runtextbeg;
+ runtextpos = runtextbeg;
+ }
}
return false;
@@ -1066,6 +1134,7 @@ protected override void Go()
}
case RegexCode.Oneloop:
+ case RegexCode.Oneloopatomic:
{
int c = Operand(1);
@@ -1085,14 +1154,17 @@ protected override void Go()
}
}
- if (c > i)
+ if (c > i && Operator() == RegexCode.Oneloop)
+ {
TrackPush(c - i - 1, Textpos() - Bump());
+ }
advance = 2;
continue;
}
case RegexCode.Notoneloop:
+ case RegexCode.Notoneloopatomic:
{
int c = Operand(1);
@@ -1112,14 +1184,17 @@ protected override void Go()
}
}
- if (c > i)
+ if (c > i && Operator() == RegexCode.Notoneloop)
+ {
TrackPush(c - i - 1, Textpos() - Bump());
+ }
advance = 2;
continue;
}
case RegexCode.Setloop:
+ case RegexCode.Setloopatomic:
{
int c = Operand(1);
@@ -1149,8 +1224,10 @@ protected override void Go()
}
}
- if (c > i)
+ if (c > i && Operator() == RegexCode.Setloop)
+ {
TrackPush(c - i - 1, Textpos() - Bump());
+ }
advance = 2;
continue;
@@ -1286,6 +1363,7 @@ protected override void Go()
}
#if DEBUG
+ [ExcludeFromCodeCoverage]
internal override void DumpState()
{
base.DumpState();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs
index 3963cbc9a1f9d..a11d25f6147ca 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs
@@ -10,8 +10,8 @@ namespace System.Text.RegularExpressions
{
internal sealed class RegexLWCGCompiler : RegexCompiler
{
- private static int s_regexCount = 0;
private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner) };
+ private static int s_regexCount = 0;
/// The top-level driver. Initializes everything then calls the Generate* methods.
public RegexRunnerFactory FactoryInstanceFromCode(RegexCode code, RegexOptions options, bool hasTimeout)
@@ -32,16 +32,13 @@ public RegexRunnerFactory FactoryInstanceFromCode(RegexCode code, RegexOptions o
DynamicMethod goMethod = DefineDynamicMethod("Go" + regexnumString, null, typeof(CompiledRegexRunner));
GenerateGo();
- DynamicMethod firstCharMethod = DefineDynamicMethod("FindFirstChar" + regexnumString, typeof(bool), typeof(CompiledRegexRunner));
+ DynamicMethod findFirstCharMethod = DefineDynamicMethod("FindFirstChar" + regexnumString, typeof(bool), typeof(CompiledRegexRunner));
GenerateFindFirstChar();
- DynamicMethod trackCountMethod = DefineDynamicMethod("InitTrackCount" + regexnumString, null, typeof(CompiledRegexRunner));
+ DynamicMethod initTrackCountMethod = DefineDynamicMethod("InitTrackCount" + regexnumString, null, typeof(CompiledRegexRunner));
GenerateInitTrackCount();
- return new CompiledRegexRunnerFactory(
- (Action)goMethod.CreateDelegate(typeof(Action)),
- (Func)firstCharMethod.CreateDelegate(typeof(Func)),
- (Action)trackCountMethod.CreateDelegate(typeof(Action)));
+ return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, initTrackCountMethod);
}
/// Begins the definition of a new method (no args) with a specified return value.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index e9e48724d05f3..dbd3339b5fe5c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -41,6 +41,7 @@
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -77,6 +78,10 @@ internal sealed class RegexNode
public const int EndZ = RegexCode.EndZ; // \Z
public const int End = RegexCode.End; // \z
+ public const int Oneloopatomic = RegexCode.Oneloopatomic; // c,n (?> a*)
+ public const int Notoneloopatomic = RegexCode.Notoneloopatomic; // c,n (?> .*)
+ public const int Setloopatomic = RegexCode.Setloopatomic; // set,n (?> \d*)
+
// Interior nodes do not correspond to primitive operations, but
// control structures compositing other operations
@@ -95,49 +100,49 @@ internal sealed class RegexNode
public const int Group = 29; // (?:) - noncapturing group
public const int Require = 30; // (?=) (?<=) - lookahead and lookbehind assertions
public const int Prevent = 31; // (?!) (?<!) - negative lookahead and lookbehind assertions
- public const int Greedy = 32; // (?>) - greedy subexpression
+ public const int Atomic = 32; // (?>) - atomic subexpression
public const int Testref = 33; // (?(n) | ) - alternation, reference
public const int Testgroup = 34; // (?(...) | )- alternation, expression
- public int NType;
- public List? Children;
- public string? Str;
- public char Ch;
- public int M;
- public int N;
+ private object? Children;
+ public int Type { get; private set; }
+ public string? Str { get; private set; }
+ public char Ch { get; private set; }
+ public int M { get; private set; }
+ public int N { get; private set; }
public readonly RegexOptions Options;
public RegexNode? Next;
public RegexNode(int type, RegexOptions options)
{
- NType = type;
+ Type = type;
Options = options;
}
public RegexNode(int type, RegexOptions options, char ch)
{
- NType = type;
+ Type = type;
Options = options;
Ch = ch;
}
public RegexNode(int type, RegexOptions options, string str)
{
- NType = type;
+ Type = type;
Options = options;
Str = str;
}
public RegexNode(int type, RegexOptions options, int m)
{
- NType = type;
+ Type = type;
Options = options;
M = m;
}
public RegexNode(int type, RegexOptions options, int m, int n)
{
- NType = type;
+ Type = type;
Options = options;
M = m;
N = n;
@@ -150,9 +155,9 @@ public bool UseOptionR()
public RegexNode ReverseLeft()
{
- if (UseOptionR() && NType == Concatenate && Children != null)
+ if (UseOptionR() && Type == Concatenate && ChildCount() > 1)
{
- Children.Reverse(0, Children.Count);
+ ((List)Children!).Reverse();
}
return this;
@@ -163,11 +168,71 @@ public RegexNode ReverseLeft()
///
private void MakeRep(int type, int min, int max)
{
- NType += (type - One);
+ Type += (type - One);
M = min;
N = max;
}
+ /// <summary>Performs additional optimizations on an entire tree prior to being used.</summary>
+ internal RegexNode FinalOptimize()
+ {
+ RegexNode rootNode = this;
+
+ // If we find a backtracking construct at the end of the regex, we can instead make it non-backtracking,
+ // since nothing would ever backtrack into it anyway. Doing this then makes the construct available
+ // to implementations that don't support backtracking.
+ if ((Options & RegexOptions.RightToLeft) == 0 && // only apply optimization when LTR to avoid needing additional code for the rarer RTL case
+ (Options & RegexOptions.Compiled) != 0) // only apply when we're compiling, as that's the only time it would make a meaningful difference
+ {
+ RegexNode node = rootNode;
+ while (true)
+ {
+ switch (node.Type)
+ {
+ case Oneloop:
+ node.Type = Oneloopatomic;
+ break;
+
+ case Notoneloop:
+ node.Type = Notoneloopatomic;
+ break;
+
+ case Setloop:
+ node.Type = Setloopatomic;
+ break;
+
+ case Capture:
+ case Concatenate:
+ RegexNode existingChild = node.Child(node.ChildCount() - 1);
+ switch (existingChild.Type)
+ {
+ default:
+ node = existingChild;
+ break;
+
+ case Alternate:
+ case Loop:
+ case Lazyloop:
+ var atomic = new RegexNode(Atomic, Options);
+ atomic.AddChild(existingChild);
+ node.ReplaceChild(node.ChildCount() - 1, atomic);
+ break;
+ }
+ continue;
+
+ case Atomic:
+ node = node.Child(0);
+ continue;
+ }
+
+ break;
+ }
+ }
+
+ // Done optimizing. Return the final tree.
+ return rootNode;
+ }
+
///
/// Removes redundant nodes from the subtree, and returns a reduced subtree.
///
@@ -175,7 +240,7 @@ private RegexNode Reduce()
{
RegexNode n;
- switch (Type())
+ switch (Type)
{
case Alternate:
n = ReduceAlternation();
@@ -187,7 +252,11 @@ private RegexNode Reduce()
case Loop:
case Lazyloop:
- n = ReduceRep();
+ n = ReduceLoops();
+ break;
+
+ case Atomic:
+ n = ReduceAtomic();
break;
case Group:
@@ -226,77 +295,176 @@ private RegexNode StripEnation(int emptyType) =>
///
private RegexNode ReduceGroup()
{
- RegexNode u;
+ RegexNode u = this;
- for (u = this; u.Type() == Group;)
+ while (u.Type == Group)
+ {
+ Debug.Assert(u.ChildCount() == 1);
u = u.Child(0);
+ }
return u;
}
///
- /// Nested repeaters just get multiplied with each other if they're not
- /// too lumpy
+ /// Simple optimization. If an atomic subexpression contains only a one/notone/set loop,
+ /// change it to be an atomic one/notone/set loop and remove the atomic node.
///
- private RegexNode ReduceRep()
+ private RegexNode ReduceAtomic()
+ {
+ Debug.Assert(Type == Atomic);
+ Debug.Assert(ChildCount() == 1);
+
+ RegexNode child = Child(0);
+ switch (child.Type)
+ {
+ case Oneloop:
+ child.Type = Oneloopatomic;
+ return child;
+
+ case Notoneloop:
+ child.Type = Notoneloopatomic;
+ return child;
+
+ case Setloop:
+ child.Type = Setloopatomic;
+ return child;
+
+ case Oneloopatomic:
+ case Notoneloopatomic:
+ case Setloopatomic:
+ return child;
+ }
+
+ return this;
+ }
+
+ /// <summary>
+ /// Nested repeaters just get multiplied with each other if they're not too lumpy.
+ /// Other optimizations may have also resulted in {Lazy}loops directly containing
+ /// sets, ones, and notones, in which case they can be transformed into the corresponding
+ /// individual looping constructs.
+ /// </summary>
+ private RegexNode ReduceLoops()
{
RegexNode u = this;
- RegexNode child;
- int type = Type();
+ int type = Type;
+ Debug.Assert(type == Loop || type == Lazyloop);
+
int min = M;
int max = N;
- while (true)
+ while (u.ChildCount() > 0)
{
- if (u.ChildCount() == 0)
- break;
-
- child = u.Child(0);
+ RegexNode child = u.Child(0);
// multiply reps of the same type only
- if (child.Type() != type)
+ if (child.Type != type)
{
- int childType = child.Type();
+ bool valid = false;
+ if (type == Loop)
+ {
+ switch (child.Type)
+ {
+ case Oneloop:
+ case Oneloopatomic:
+ case Notoneloop:
+ case Notoneloopatomic:
+ case Setloop:
+ case Setloopatomic:
+ valid = true;
+ break;
+ }
+ }
+ else // type == Lazyloop
+ {
+ switch (child.Type)
+ {
+ case Onelazy:
+ case Notonelazy:
+ case Setlazy:
+ valid = true;
+ break;
+ }
+ }
- if (!(childType >= Oneloop && childType <= Setloop && type == Loop ||
- childType >= Onelazy && childType <= Setlazy && type == Lazyloop))
+ if (!valid)
+ {
break;
+ }
}
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
// [but things like (a {2,})+ are not too lumpy...]
if (u.M == 0 && child.M > 1 || child.N < child.M * 2)
+ {
break;
+ }
u = child;
+
if (u.M > 0)
+ {
u.M = min = ((int.MaxValue - 1) / u.M < min) ? int.MaxValue : u.M * min;
+ }
+
if (u.N > 0)
+ {
u.N = max = ((int.MaxValue - 1) / u.N < max) ? int.MaxValue : u.N * max;
+ }
+ }
+
+ if (min == int.MaxValue)
+ {
+ return new RegexNode(Nothing, Options);
+ }
+
+ // If the Loop or Lazyloop now only has one child node and it's a Set, One, or Notone,
+ // reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
+ // generally have only produced the latter, but other reductions could have exposed
+ // this.
+ if (u.ChildCount() == 1)
+ {
+ RegexNode child = u.Child(0);
+ switch (child.Type)
+ {
+ case One:
+ case Notone:
+ case Set:
+ child.MakeRep(u.Type == Lazyloop ? Onelazy : Oneloop, u.M, u.N);
+ u = child;
+ break;
+ }
}
- return min == int.MaxValue ? new RegexNode(Nothing, Options) : u;
+ return u;
}
///
- /// Simple optimization. If a set is an inverse singleton or empty, it's transformed accordingly.
+ /// Simple optimization. If a set is a singleton, an inverse singleton, or empty, it's transformed accordingly.
///
private RegexNode ReduceSet()
{
- // Extract empty-set, one and not-one case as special
+ // Extract empty-set, one, and not-one case as special
Debug.Assert(Str != null);
if (RegexCharClass.IsEmpty(Str))
{
- NType = Nothing;
+ Type = Nothing;
Str = null;
}
+ else if (RegexCharClass.IsSingleton(Str))
+ {
+ Ch = RegexCharClass.SingletonChar(Str);
+ Str = null;
+ Type += (One - Set);
+ }
else if (RegexCharClass.IsSingletonInverse(Str))
{
Ch = RegexCharClass.SingletonChar(Str);
Str = null;
- NType += (Notone - Set);
+ Type += (Notone - Set);
}
return this;
@@ -313,8 +481,16 @@ private RegexNode ReduceSet()
///
private RegexNode ReduceAlternation()
{
- if (Children == null)
+ int childCount = ChildCount();
+ if (childCount == 0)
+ {
return new RegexNode(Nothing, Options);
+ }
+
+ if (childCount == 1)
+ {
+ return Child(0);
+ }
bool wasLastSet = false;
bool lastNodeCannotMerge = false;
@@ -325,29 +501,40 @@ private RegexNode ReduceAlternation()
RegexNode at;
RegexNode prev;
- for (i = 0, j = 0; i < Children.Count; i++, j++)
+ List children = (List)Children!;
+ for (i = 0, j = 0; i < children.Count; i++, j++)
{
- at = Children[i];
+ at = children[i];
if (j < i)
- Children[j] = at;
+ children[j] = at;
while (true)
{
- if (at.NType == Alternate)
+ if (at.Type == Alternate)
{
- for (int k = 0; k < at.Children!.Count; k++)
- at.Children[k].Next = this;
-
- Children.InsertRange(i + 1, at.Children);
+ if (at.Children is List atChildren)
+ {
+ for (int k = 0; k < atChildren.Count; k++)
+ {
+ atChildren[k].Next = this;
+ }
+ children.InsertRange(i + 1, atChildren);
+ }
+ else
+ {
+ RegexNode atChild = (RegexNode)at.Children!;
+ atChild.Next = this;
+ children.Insert(i + 1, atChild);
+ }
j--;
}
- else if (at.NType == Set || at.NType == One)
+ else if (at.Type == Set || at.Type == One)
{
// Cannot merge sets if L or I options differ, or if either are negated.
optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
- if (at.NType == Set)
+ if (at.Type == Set)
{
if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str))
{
@@ -369,10 +556,10 @@ private RegexNode ReduceAlternation()
// The last node was a Set or a One, we're a Set or One and our options are the same.
// Merge the two nodes.
j--;
- prev = Children[j];
+ prev = children[j];
RegexCharClass prevCharClass;
- if (prev.NType == One)
+ if (prev.Type == One)
{
prevCharClass = new RegexCharClass();
prevCharClass.AddChar(prev.Ch);
@@ -382,7 +569,7 @@ private RegexNode ReduceAlternation()
prevCharClass = RegexCharClass.Parse(prev.Str!);
}
- if (at.NType == One)
+ if (at.Type == One)
{
prevCharClass.AddChar(at.Ch);
}
@@ -392,10 +579,10 @@ private RegexNode ReduceAlternation()
prevCharClass.AddCharClass(atCharClass);
}
- prev.NType = Set;
+ prev.Type = Set;
prev.Str = prevCharClass.ToStringClass();
}
- else if (at.NType == Nothing)
+ else if (at.Type == Nothing)
{
j--;
}
@@ -409,7 +596,9 @@ private RegexNode ReduceAlternation()
}
if (j < i)
- Children.RemoveRange(j, i - j);
+ {
+ children.RemoveRange(j, i - j);
+ }
return StripEnation(Nothing);
}
@@ -422,39 +611,54 @@ private RegexNode ReduceAlternation()
///
private RegexNode ReduceConcatenation()
{
- if (Children == null)
+ int childCount = ChildCount();
+ if (childCount == 0)
+ {
return new RegexNode(Empty, Options);
+ }
+
+ if (childCount == 1)
+ {
+ return Child(0);
+ }
bool wasLastString = false;
RegexOptions optionsLast = 0;
- RegexOptions optionsAt;
- int i;
- int j;
+ int i, j;
- for (i = 0, j = 0; i < Children.Count; i++, j++)
+ List children = (List)Children!;
+ for (i = 0, j = 0; i < children.Count; i++, j++)
{
- RegexNode at;
- RegexNode prev;
-
- at = Children[i];
+ RegexNode at = children[i];
if (j < i)
- Children[j] = at;
+ {
+ children[j] = at;
+ }
- if (at.NType == Concatenate &&
+ if (at.Type == Concatenate &&
((at.Options & RegexOptions.RightToLeft) == (Options & RegexOptions.RightToLeft)))
{
- for (int k = 0; k < at.Children!.Count; k++)
- at.Children[k].Next = this;
-
- Children.InsertRange(i + 1, at.Children);
+ if (at.Children is List atChildren)
+ {
+ for (int k = 0; k < atChildren.Count; k++)
+ {
+ atChildren[k].Next = this;
+ }
+ children.InsertRange(i + 1, atChildren);
+ }
+ else
+ {
+ RegexNode atChild = (RegexNode)at.Children!;
+ atChild.Next = this;
+ children.Insert(i + 1, atChild);
+ }
j--;
}
- else if (at.NType == Multi ||
- at.NType == One)
+ else if (at.Type == Multi || at.Type == One)
{
// Cannot merge strings if L or I options differ
- optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
+ RegexOptions optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
if (!wasLastString || optionsLast != optionsAt)
{
@@ -463,30 +667,24 @@ private RegexNode ReduceConcatenation()
continue;
}
- prev = Children[--j];
+ RegexNode prev = children[--j];
- if (prev.NType == One)
+ if (prev.Type == One)
{
- prev.NType = Multi;
+ prev.Type = Multi;
prev.Str = prev.Ch.ToString();
}
if ((optionsAt & RegexOptions.RightToLeft) == 0)
{
- if (at.NType == One)
- prev.Str += at.Ch.ToString();
- else
- prev.Str += at.Str;
+ prev.Str += (at.Type == One) ? at.Ch.ToString() : at.Str;
}
else
{
- if (at.NType == One)
- prev.Str = at.Ch.ToString() + prev.Str;
- else
- prev.Str = at.Str + prev.Str;
+ prev.Str = (at.Type == One) ? at.Ch.ToString() + prev.Str : at.Str + prev.Str;
}
}
- else if (at.NType == Empty)
+ else if (at.Type == Empty)
{
j--;
}
@@ -497,11 +695,152 @@ private RegexNode ReduceConcatenation()
}
if (j < i)
- Children.RemoveRange(j, i - j);
+ {
+ children.RemoveRange(j, i - j);
+ }
+
+ // Now try to convert as many loops as possible to be atomic to avoid unnecessary backtracking.
+ if ((Options & RegexOptions.RightToLeft) == 0)
+ {
+ ReduceConcatenateWithAutoAtomic();
+ }
+ // If the concatenation is now empty, return an empty node, or if it's got a single child, return that child.
+ // Otherwise, return this.
return StripEnation(Empty);
}
+ /// <summary>
+ /// Finds oneloop, notoneloop, and setloop nodes in the concatenation that can be automatically upgraded
+ /// to oneloopatomic, notoneloopatomic, and setloopatomic nodes. Such changes avoid potential useless backtracking.
+ /// This looks for cases like A*B, where A and B are known to not overlap: in such cases,
+ /// we can effectively convert this to (?>A*)B.
+ /// </summary>
+ private void ReduceConcatenateWithAutoAtomic()
+ {
+ Debug.Assert(Type == Concatenate);
+ Debug.Assert((Options & RegexOptions.RightToLeft) == 0);
+ Debug.Assert(Children is List);
+
+ List children = (List)Children;
+ for (int i = 0; i < children.Count - 1; i++)
+ {
+ RegexNode node = children[i], subsequent = children[i + 1];
+
+ // Skip down the node past irrelevant capturing groups. We don't need to
+ // skip Groups, as they should have already been reduced away.
+ while (node.Type == Capture)
+ {
+ Debug.Assert(node.ChildCount() == 1);
+ node = node.Child(0);
+ }
+ Debug.Assert(node.Type != Group);
+
+ // Skip the successor down to the guaranteed next node.
+ while (subsequent.ChildCount() > 0)
+ {
+ Debug.Assert(subsequent.Type != Group);
+ switch (subsequent.Type)
+ {
+ case Capture:
+ case Atomic:
+ case Require:
+ case Concatenate:
+ case Loop when subsequent.M > 0:
+ case Lazyloop when subsequent.M > 0:
+ subsequent = subsequent.Child(0);
+ continue;
+ }
+
+ break;
+ }
+
+ // If the two nodes don't agree on case-insensitivity, don't try to optimize.
+ // If they're both case sensitive or both case insensitive, then their tokens
+ // will be comparable.
+ if ((node.Options & RegexOptions.IgnoreCase) != (subsequent.Options & RegexOptions.IgnoreCase))
+ {
+ continue;
+ }
+
+ // If this node is a one/notone/setloop, see if it overlaps with its successor in the concatenation.
+ // If it doesn't, then we can upgrade it to being a one/notone/setloopatomic.
+ // Doing so avoids unnecessary backtracking.
+ switch (node.Type)
+ {
+ case Oneloop:
+ switch (subsequent.Type)
+ {
+ case One when node.Ch != subsequent.Ch:
+ case Onelazy when subsequent.M > 0 && node.Ch != subsequent.Ch:
+ case Oneloop when subsequent.M > 0 && node.Ch != subsequent.Ch:
+ case Oneloopatomic when subsequent.M > 0 && node.Ch != subsequent.Ch:
+ case Notone when node.Ch == subsequent.Ch:
+ case Notonelazy when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Notoneloop when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Notoneloopatomic when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Multi when node.Ch != subsequent.Str![0]:
+ case Set when !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
+ case Setlazy when subsequent.M > 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
+ case Setloop when subsequent.M > 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
+ case Setloopatomic when subsequent.M > 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
+ case End:
+ case EndZ when node.Ch != '\n':
+ case Eol when node.Ch != '\n':
+ case Boundary when RegexCharClass.IsWordChar(node.Ch):
+ case Nonboundary when !RegexCharClass.IsWordChar(node.Ch):
+ case ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
+ case NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
+ node.Type = Oneloopatomic;
+ break;
+ }
+ break;
+
+ case Notoneloop:
+ switch (subsequent.Type)
+ {
+ case One when node.Ch == subsequent.Ch:
+ case Onelazy when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Oneloop when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Oneloopatomic when subsequent.M > 0 && node.Ch == subsequent.Ch:
+ case Multi when node.Ch == subsequent.Str![0]:
+ case End:
+ node.Type = Notoneloopatomic;
+ break;
+ }
+ break;
+
+ case Setloop:
+ switch (subsequent.Type)
+ {
+ case One when !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Onelazy when subsequent.M > 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Oneloop when subsequent.M > 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Oneloopatomic when subsequent.M > 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Notone when RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Notonelazy when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Notoneloop when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Notoneloopatomic when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+ case Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
+ case Set when !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
+ case Setlazy when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
+ case Setloop when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
+ case Setloopatomic when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
+ case End:
+ case EndZ when !RegexCharClass.CharInClass('\n', node.Str!):
+ case Eol when !RegexCharClass.CharInClass('\n', node.Str!):
+ case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass: // TODO: Expand these with a more inclusive overlap check that considers categories
+ case Nonboundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
+ case ECMABoundary when node.Str == RegexCharClass.ECMAWordClass || node.Str == RegexCharClass.ECMADigitClass:
+ case NonECMABoundary when node.Str == RegexCharClass.NotECMAWordClass || node.Str == RegexCharClass.NotDigitClass:
+ node.Type = Setloopatomic;
+ break;
+ }
+ break;
+ }
+ }
+ }
+
public RegexNode MakeQuantifier(bool lazy, int min, int max)
{
if (min == 0 && max == 0)
@@ -510,7 +849,7 @@ public RegexNode MakeQuantifier(bool lazy, int min, int max)
if (min == 1 && max == 1)
return this;
- switch (NType)
+ switch (Type)
{
case One:
case Notone:
@@ -527,147 +866,201 @@ public RegexNode MakeQuantifier(bool lazy, int min, int max)
public void AddChild(RegexNode newChild)
{
- if (Children == null)
- Children = new List(4);
-
RegexNode reducedChild = newChild.Reduce();
- Children.Add(reducedChild);
reducedChild.Next = this;
+
+ if (Children is null)
+ {
+ Children = reducedChild;
+ }
+ else if (Children is RegexNode currentChild)
+ {
+ Children = new List() { currentChild, reducedChild };
+ }
+ else
+ {
+ ((List)Children).Add(reducedChild);
+ }
}
- public RegexNode Child(int i)
+ public void ReplaceChild(int index, RegexNode newChild)
{
- return Children![i];
+ Debug.Assert(Children != null);
+ Debug.Assert(index < ChildCount());
+
+ newChild.Next = this;
+ if (Children is RegexNode)
+ {
+ Children = newChild;
+ }
+ else
+ {
+ ((List)Children)[index] = newChild;
+ }
}
- public int ChildCount()
+ public RegexNode Child(int i)
{
- return Children == null ? 0 : Children.Count;
+ if (Children is RegexNode child)
+ {
+ return child;
+ }
+
+ return ((List)Children!)[i];
}
- public int Type()
+ public int ChildCount()
{
- return NType;
+ if (Children is null)
+ {
+ return 0;
+ }
+
+ if (Children is List children)
+ {
+ return children.Count;
+ }
+
+ Debug.Assert(Children is RegexNode);
+ return 1;
}
#if DEBUG
- private const string Space = " ";
- private static readonly string[] s_typeStr = new string[] {
- "Onerep", "Notonerep", "Setrep",
- "Oneloop", "Notoneloop", "Setloop",
- "Onelazy", "Notonelazy", "Setlazy",
- "One", "Notone", "Set",
- "Multi", "Ref",
- "Bol", "Eol", "Boundary", "Nonboundary",
- "ECMABoundary", "NonECMABoundary",
- "Beginning", "Start", "EndZ", "End",
- "Nothing", "Empty",
- "Alternate", "Concatenate",
- "Loop", "Lazyloop",
- "Capture", "Group", "Require", "Prevent", "Greedy",
- "Testref", "Testgroup"};
-
- private string Description()
+ [ExcludeFromCodeCoverage]
+ public string Description()
{
- StringBuilder ArgSb = new StringBuilder();
-
- ArgSb.Append(s_typeStr[NType]);
-
- if ((Options & RegexOptions.ExplicitCapture) != 0)
- ArgSb.Append("-C");
- if ((Options & RegexOptions.IgnoreCase) != 0)
- ArgSb.Append("-I");
- if ((Options & RegexOptions.RightToLeft) != 0)
- ArgSb.Append("-L");
- if ((Options & RegexOptions.Multiline) != 0)
- ArgSb.Append("-M");
- if ((Options & RegexOptions.Singleline) != 0)
- ArgSb.Append("-S");
- if ((Options & RegexOptions.IgnorePatternWhitespace) != 0)
- ArgSb.Append("-X");
- if ((Options & RegexOptions.ECMAScript) != 0)
- ArgSb.Append("-E");
-
- switch (NType)
+
+ string typeStr = Type switch
+ {
+ Oneloop => nameof(Oneloop),
+ Notoneloop => nameof(Notoneloop),
+ Setloop => nameof(Setloop),
+ Onelazy => nameof(Onelazy),
+ Notonelazy => nameof(Notonelazy),
+ Setlazy => nameof(Setlazy),
+ One => nameof(One),
+ Notone => nameof(Notone),
+ Set => nameof(Set),
+ Multi => nameof(Multi),
+ Ref => nameof(Ref),
+ Bol => nameof(Bol),
+ Eol => nameof(Eol),
+ Boundary => nameof(Boundary),
+ Nonboundary => nameof(Nonboundary),
+ ECMABoundary => nameof(ECMABoundary),
+ NonECMABoundary => nameof(NonECMABoundary),
+ Beginning => nameof(Beginning),
+ Start => nameof(Start),
+ EndZ => nameof(EndZ),
+ End => nameof(End),
+ Oneloopatomic => nameof(Oneloopatomic),
+ Notoneloopatomic => nameof(Notoneloopatomic),
+ Setloopatomic => nameof(Setloopatomic),
+ Nothing => nameof(Nothing),
+ Empty => nameof(Empty),
+ Lazyloop => nameof(Lazyloop),
+ Capture => nameof(Capture),
+ Group => nameof(Group),
+ Require => nameof(Require),
+ Prevent => nameof(Prevent),
+ Atomic => nameof(Atomic),
+ Testref => nameof(Testref),
+ Testgroup => nameof(Testgroup),
+ _ => "(unknown)"
+ };
+
+ var argSb = new StringBuilder().Append(typeStr);
+
+ if ((Options & RegexOptions.ExplicitCapture) != 0) argSb.Append("-C");
+ if ((Options & RegexOptions.IgnoreCase) != 0) argSb.Append("-I");
+ if ((Options & RegexOptions.RightToLeft) != 0) argSb.Append("-L");
+ if ((Options & RegexOptions.Multiline) != 0) argSb.Append("-M");
+ if ((Options & RegexOptions.Singleline) != 0) argSb.Append("-S");
+ if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) argSb.Append("-X");
+ if ((Options & RegexOptions.ECMAScript) != 0) argSb.Append("-E");
+
+ switch (Type)
{
case Oneloop:
+ case Oneloopatomic:
case Notoneloop:
+ case Notoneloopatomic:
case Onelazy:
case Notonelazy:
case One:
case Notone:
- ArgSb.Append("(Ch = " + RegexCharClass.CharDescription(Ch) + ")");
+ argSb.Append("(Ch = " + RegexCharClass.CharDescription(Ch) + ")");
break;
case Capture:
- ArgSb.Append("(index = " + M.ToString(CultureInfo.InvariantCulture) + ", unindex = " + N.ToString(CultureInfo.InvariantCulture) + ")");
+ argSb.Append("(index = " + M.ToString(CultureInfo.InvariantCulture) + ", unindex = " + N.ToString(CultureInfo.InvariantCulture) + ")");
break;
case Ref:
case Testref:
- ArgSb.Append("(index = " + M.ToString(CultureInfo.InvariantCulture) + ")");
+ argSb.Append("(index = " + M.ToString(CultureInfo.InvariantCulture) + ")");
break;
case Multi:
- ArgSb.Append("(String = " + Str + ")");
+ argSb.Append("(String = " + Str + ")");
break;
case Set:
case Setloop:
+ case Setloopatomic:
case Setlazy:
- ArgSb.Append("(Set = " + RegexCharClass.SetDescription(Str!) + ")");
+ argSb.Append("(Set = " + RegexCharClass.SetDescription(Str!) + ")");
break;
}
- switch (NType)
+ switch (Type)
{
case Oneloop:
+ case Oneloopatomic:
case Notoneloop:
+ case Notoneloopatomic:
case Onelazy:
case Notonelazy:
case Setloop:
+ case Setloopatomic:
case Setlazy:
case Loop:
case Lazyloop:
- ArgSb.Append("(Min = " + M.ToString(CultureInfo.InvariantCulture) + ", Max = " + (N == int.MaxValue ? "inf" : Convert.ToString(N, CultureInfo.InvariantCulture)) + ")");
+ argSb.Append("(Min = " + M.ToString(CultureInfo.InvariantCulture) + ", Max = " + (N == int.MaxValue ? "inf" : Convert.ToString(N, CultureInfo.InvariantCulture)) + ")");
break;
}
- return ArgSb.ToString();
+ return argSb.ToString();
}
+ [ExcludeFromCodeCoverage]
public void Dump()
{
- List Stack = new List();
- RegexNode? CurNode;
- int CurChild;
-
- CurNode = this;
- CurChild = 0;
+ List stack = new List();
+ RegexNode? curNode = this;
+ int curChild = 0;
- Debug.WriteLine(CurNode.Description());
+ Debug.WriteLine(curNode.Description());
while (true)
{
- if (CurNode!.Children != null && CurChild < CurNode.Children.Count)
+ if (curChild < curNode!.ChildCount())
{
- Stack.Add(CurChild + 1);
- CurNode = CurNode.Children[CurChild];
- CurChild = 0;
-
- int Depth = Stack.Count;
- if (Depth > 32)
- Depth = 32;
+ stack.Add(curChild + 1);
+ curNode = curNode.Child(curChild);
+ curChild = 0;
- Debug.WriteLine(Space.Substring(0, Depth) + CurNode.Description());
+ Debug.WriteLine(new string(' ', stack.Count) + curNode.Description());
}
else
{
- if (Stack.Count == 0)
+ if (stack.Count == 0)
+ {
break;
+ }
- CurChild = Stack[Stack.Count - 1];
- Stack.RemoveAt(Stack.Count - 1);
- CurNode = CurNode.Next;
+ curChild = stack[stack.Count - 1];
+ stack.RemoveAt(stack.Count - 1);
+ curNode = curNode.Next;
}
}
}
#endif
- }
+ }
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParseException.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParseException.cs
index 0a723cd2bfdb7..6304b27d8e821 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParseException.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParseException.cs
@@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions
[Serializable]
internal sealed class RegexParseException : ArgumentException
{
- private readonly RegexParseError _error;
+ private readonly RegexParseError _error; // tests access this via private reflection
///
/// The error that happened during parsing.
@@ -27,28 +27,14 @@ public RegexParseException(RegexParseError error, int offset, string message) :
Offset = offset;
}
- public RegexParseException() : base()
- {
- }
-
- public RegexParseException(string message) : base(message)
- {
- }
-
- public RegexParseException(string message, Exception inner) : base(message, inner)
- {
- }
-
- private RegexParseException(SerializationInfo info, StreamingContext context)
- : base(info, context)
+ private RegexParseException(SerializationInfo info, StreamingContext context) : base(info, context)
{
}
public override void GetObjectData(SerializationInfo info, StreamingContext context)
{
base.GetObjectData(info, context);
- // To maintain serialization support with netfx.
- info.SetType(typeof(ArgumentException));
+ info.SetType(typeof(ArgumentException)); // To maintain serialization support with netfx.
}
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
index edfe8bcfc442c..393a468e7b095 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
@@ -499,7 +499,7 @@ private RegexNode ScanRegex()
AddGroup();
- return Unit()!;
+ return Unit()!.FinalOptimize();
}
/*
@@ -802,8 +802,8 @@ private RegexNode ScanReplacement()
break;
case '>':
- // greedy subexpression
- nodeType = RegexNode.Greedy;
+ // atomic subexpression
+ nodeType = RegexNode.Atomic;
break;
case '\'':
@@ -1020,7 +1020,7 @@ private RegexNode ScanReplacement()
nodeType = RegexNode.Group;
// Disallow options in the children of a testgroup node
- if (_group!.NType != RegexNode.Testgroup)
+ if (_group!.Type != RegexNode.Testgroup)
{
ScanOptions();
}
@@ -2155,7 +2155,7 @@ private void PopGroup()
_stack = _group!.Next;
// The first () inside a Testgroup group goes directly to the group
- if (_group.Type() == RegexNode.Testgroup && _group.ChildCount() == 0)
+ if (_group.Type == RegexNode.Testgroup && _group.ChildCount() == 0)
{
if (_unit == null)
{
@@ -2183,7 +2183,7 @@ private void AddAlternate()
{
// The | parts inside a Testgroup group go directly to the group
- if (_group!.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref)
+ if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref)
{
_group.AddChild(_concatenation!.ReverseLeft());
}
@@ -2257,11 +2257,11 @@ private void AddUnitType(int type)
/// Finish the current group (in response to a ')' or end)
private void AddGroup()
{
- if (_group!.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref)
+ if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref)
{
_group.AddChild(_concatenation!.ReverseLeft());
- if (_group.Type() == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3)
+ if (_group.Type == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3)
{
throw MakeException(RegexParseError.TooManyAlternates, SR.TooManyAlternates);
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
index e299bc9642734..88013dd6335b5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
@@ -30,7 +30,7 @@ internal sealed class RegexReplacement
///
public RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
- if (concat.Type() != RegexNode.Concatenate)
+ if (concat.Type != RegexNode.Concatenate)
throw new ArgumentException(SR.ReplacementError);
Span vsbStack = stackalloc char[256];
@@ -42,7 +42,7 @@ public RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
RegexNode child = concat.Child(i);
- switch (child.Type())
+ switch (child.Type)
{
case RegexNode.Multi:
vsb.Append(child.Str!);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
index db3d8b4c82b43..6f2bb75f97b69 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
@@ -15,6 +15,7 @@
// backtracked results from) the Match instance.
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
@@ -547,6 +548,7 @@ protected int MatchLength(int cap)
///
/// Dump the current state
///
+ [ExcludeFromCodeCoverage]
internal virtual void DumpState()
{
Debug.WriteLine("Text: " + TextposDescription());
@@ -554,6 +556,7 @@ internal virtual void DumpState()
Debug.WriteLine("Stack: " + StackDescription(runstack!, runstackpos));
}
+ [ExcludeFromCodeCoverage]
private static string StackDescription(int[] a, int index)
{
var sb = new StringBuilder();
@@ -579,6 +582,7 @@ private static string StackDescription(int[] a, int index)
return sb.ToString();
}
+ [ExcludeFromCodeCoverage]
internal virtual string TextposDescription()
{
var sb = new StringBuilder();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
index 09c29b91cc7b6..e0ddabe1afc6f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
@@ -6,6 +6,7 @@
// global information attached.
using System.Collections;
+using System.Diagnostics.CodeAnalysis;
namespace System.Text.RegularExpressions
{
@@ -31,18 +32,11 @@ internal RegexTree(RegexNode root, Hashtable caps, int[] capNumList, int capTop,
}
#if DEBUG
- public void Dump()
- {
- Root.Dump();
- }
+ [ExcludeFromCodeCoverage]
+ public void Dump() => Root.Dump();
- public bool Debug
- {
- get
- {
- return (Options & RegexOptions.Debug) != 0;
- }
- }
+ [ExcludeFromCodeCoverage]
+ public bool Debug => (Options & RegexOptions.Debug) != 0;
#endif
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
index 917379bfed9e4..7c71cf5ef9378 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
@@ -28,8 +28,7 @@ internal ref struct RegexWriter
private ValueListBuilder _emitted;
private ValueListBuilder _intStack;
- private readonly Dictionary _stringHash;
- private readonly List _stringTable;
+ private readonly Dictionary _stringTable;
private Hashtable? _caps;
private int _trackCount;
@@ -37,8 +36,7 @@ private RegexWriter(Span emittedSpan, Span intStackSpan)
{
_emitted = new ValueListBuilder(emittedSpan);
_intStack = new ValueListBuilder(intStackSpan);
- _stringHash = new Dictionary();
- _stringTable = new List();
+ _stringTable = new Dictionary();
_caps = null;
_trackCount = 0;
}
@@ -102,15 +100,16 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
while (true)
{
- if (curNode.Children == null)
+ int curNodeChildCount = curNode!.ChildCount();
+ if (curNodeChildCount == 0)
{
- EmitFragment(curNode.NType, curNode, 0);
+ EmitFragment(curNode.Type, curNode, 0);
}
- else if (curChild < curNode.Children.Count)
+ else if (curChild < curNodeChildCount)
{
- EmitFragment(curNode.NType | BeforeChild, curNode, curChild);
+ EmitFragment(curNode.Type | BeforeChild, curNode, curChild);
- curNode = curNode.Children[curChild];
+ curNode = curNode.Child(curChild);
_intStack.Append(curChild);
curChild = 0;
continue;
@@ -122,7 +121,7 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
curChild = _intStack.Pop();
curNode = curNode.Next;
- EmitFragment(curNode!.NType | AfterChild, curNode, curChild);
+ EmitFragment(curNode!.Type | AfterChild, curNode, curChild);
curChild++;
}
@@ -131,20 +130,26 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
RegexPrefix? fcPrefix = RegexFCD.FirstChars(tree);
RegexPrefix prefix = RegexFCD.Prefix(tree);
- bool rtl = ((tree.Options & RegexOptions.RightToLeft) != 0);
+ bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;
CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
- RegexBoyerMoore? bmPrefix;
- if (prefix.Prefix.Length > 0)
+ RegexBoyerMoore? bmPrefix = null;
+ if (prefix.Prefix.Length > 1) // if it's == 1, we're better off using fcPrefix
+ {
bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture);
- else
- bmPrefix = null;
+ }
int anchors = RegexFCD.Anchors(tree);
int[] emitted = _emitted.AsSpan().ToArray();
- return new RegexCode(emitted, _stringTable, _trackCount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
+ var strings = new string[_stringTable.Count];
+ foreach (KeyValuePair stringEntry in _stringTable)
+ {
+ strings[stringEntry.Value] = stringEntry.Key;
+ }
+
+ return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
}
///
@@ -198,17 +203,12 @@ private void Emit(int op, int opd1, int opd2)
/// Returns an index in the string table for a string;
/// uses a hashtable to eliminate duplicates.
///
- private int StringCode(string? str)
+ private int StringCode(string str)
{
- if (str == null)
- str = string.Empty;
-
- int i;
- if (!_stringHash.TryGetValue(str, out i))
+ if (!_stringTable.TryGetValue(str, out int i))
{
i = _stringTable.Count;
- _stringHash[str] = i;
- _stringTable.Add(str);
+ _stringTable.Add(str, i);
}
return i;
@@ -239,13 +239,13 @@ private int MapCapnum(int capnum)
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
int bits = 0;
-
- if (nodetype <= RegexNode.Ref)
+ if (node.UseOptionR())
{
- if (node.UseOptionR())
- bits |= RegexCode.Rtl;
- if ((node.Options & RegexOptions.IgnoreCase) != 0)
- bits |= RegexCode.Ci;
+ bits |= RegexCode.Rtl;
+ }
+ if ((node.Options & RegexOptions.IgnoreCase) != 0)
+ {
+ bits |= RegexCode.Ci;
}
switch (nodetype)
@@ -256,7 +256,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
break;
case RegexNode.Alternate | BeforeChild:
- if (curIndex < node.Children!.Count - 1)
+ if (curIndex < node.ChildCount() - 1)
{
_intStack.Append(_emitted.Length);
Emit(RegexCode.Lazybranch, 0);
@@ -265,7 +265,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
case RegexNode.Alternate | AfterChild:
{
- if (curIndex < node.Children!.Count - 1)
+ if (curIndex < node.ChildCount() - 1)
{
int LBPos = _intStack.Pop();
_intStack.Append(_emitted.Length);
@@ -306,7 +306,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Goto, 0);
PatchJump(Branchpos, _emitted.Length);
Emit(RegexCode.Forejump);
- if (node.Children!.Count > 1)
+ if (node.ChildCount() > 1)
break;
// else fallthrough
goto case 1;
@@ -344,7 +344,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Getmark);
Emit(RegexCode.Forejump);
- if (node.Children!.Count > 2)
+ if (node.ChildCount() > 2)
break;
// else fallthrough
goto case 2;
@@ -428,50 +428,62 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Forejump);
break;
- case RegexNode.Greedy | BeforeChild:
+ case RegexNode.Atomic | BeforeChild:
Emit(RegexCode.Setjump);
break;
- case RegexNode.Greedy | AfterChild:
+ case RegexNode.Atomic | AfterChild:
Emit(RegexCode.Forejump);
break;
case RegexNode.One:
case RegexNode.Notone:
- Emit(node.NType | bits, node.Ch);
+ Emit(node.Type | bits, node.Ch);
break;
case RegexNode.Notoneloop:
+ case RegexNode.Notoneloopatomic:
case RegexNode.Notonelazy:
case RegexNode.Oneloop:
+ case RegexNode.Oneloopatomic:
case RegexNode.Onelazy:
if (node.M > 0)
- Emit(((node.NType == RegexNode.Oneloop || node.NType == RegexNode.Onelazy) ?
+ {
+ Emit(((node.Type == RegexNode.Oneloop || node.Type == RegexNode.Oneloopatomic || node.Type == RegexNode.Onelazy) ?
RegexCode.Onerep : RegexCode.Notonerep) | bits, node.Ch, node.M);
+ }
if (node.N > node.M)
- Emit(node.NType | bits, node.Ch, node.N == int.MaxValue ?
- int.MaxValue : node.N - node.M);
+ {
+ Emit(node.Type | bits, node.Ch, node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
+ }
break;
case RegexNode.Setloop:
+ case RegexNode.Setloopatomic:
case RegexNode.Setlazy:
- if (node.M > 0)
- Emit(RegexCode.Setrep | bits, StringCode(node.Str), node.M);
- if (node.N > node.M)
- Emit(node.NType | bits, StringCode(node.Str),
- (node.N == int.MaxValue) ? int.MaxValue : node.N - node.M);
+ {
+ int stringCode = StringCode(node.Str!);
+ if (node.M > 0)
+ {
+ Emit(RegexCode.Setrep | bits, stringCode, node.M);
+ }
+ if (node.N > node.M)
+ {
+ Emit(node.Type | bits, stringCode, (node.N == int.MaxValue) ? int.MaxValue : node.N - node.M);
+ }
+ }
break;
case RegexNode.Multi:
- Emit(node.NType | bits, StringCode(node.Str));
+ Emit(node.Type | bits, StringCode(node.Str!));
break;
case RegexNode.Set:
- Emit(node.NType | bits, StringCode(node.Str));
+ Emit(node.Type | bits, StringCode(node.Str!));
break;
case RegexNode.Ref:
- Emit(node.NType | bits, MapCapnum(node.M));
+ Emit(node.Type | bits, MapCapnum(node.M));
break;
case RegexNode.Nothing:
@@ -485,7 +497,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.End:
- Emit(node.NType);
+ Emit(node.Type);
break;
default:
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Configurations.props b/src/libraries/System.Text.RegularExpressions/tests/Configurations.props
index b6e0d3ce6383b..3e0106dbfb5ca 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Configurations.props
+++ b/src/libraries/System.Text.RegularExpressions/tests/Configurations.props
@@ -2,6 +2,7 @@
$(NetCoreAppCurrent);
+ $(NetFrameworkCurrent);
\ No newline at end of file
diff --git a/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs
new file mode 100644
index 0000000000000..76b508ca87965
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs
@@ -0,0 +1,1141 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// See the LICENSE file in the project root for more information.
+
+// assembly: System_test
+// namespace: MonoTests.System.Text.RegularExpressions
+// file: PerlTrials.cs
+//
+// author: Dan Lewis (dlewis@gmx.co.uk)
+// (c) 2002
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+ public class MonoTests
+ {
+ // Ported from https://github.com/mono/mono/blob/0f2995e95e98e082c7c7039e17175cf2c6a00034/mcs/class/System/Test/System.Text.RegularExpressions/PerlTrials.cs
+ // Which in turn ported from perl-5.6.1/t/op/re_tests
+
+ [Theory]
+ [MemberData(nameof(RegexTestCasesWithOptions))]
+ public void ValidateRegex(string pattern, RegexOptions options, string input, string expected)
+ {
+ string result;
+ try
+ {
+ var re = new Regex(pattern, options);
+ int[] groupNums = re.GetGroupNumbers();
+ Match m = re.Match(input);
+
+ if (m.Success)
+ {
+ result = "Pass.";
+ for (int i = 0; i < m.Groups.Count; ++i)
+ {
+ int gid = groupNums[i];
+ Group group = m.Groups[gid];
+
+ result += $" Group[{gid}]=";
+ foreach (Capture cap in group.Captures)
+ {
+ result += $"({cap.Index},{cap.Length})";
+ }
+ }
+ }
+ else
+ {
+ result = "Fail.";
+ }
+ }
+ catch
+ {
+ result = "Error.";
+ }
+
+ Assert.Equal(expected, result);
+ }
+
+ public static IEnumerable
" +
+ "" +
+ "MSDN Home Page" +
+ "" +
+ "Microsoft Corporation Home Page" +
+ "" +
+ ".NET Base Class Library blog";
+
+ Match m = Regex.Match(InputString, HrefPattern, options | RegexOptions.IgnoreCase);
+ Assert.True(m.Success);
+ Assert.Equal("http://msdn2.microsoft.com", m.Groups[1].ToString());
+ Assert.Equal(43, m.Groups[1].Index);
+
+ m = m.NextMatch();
+ Assert.True(m.Success);
+ Assert.Equal("http://www.microsoft.com", m.Groups[1].ToString());
+ Assert.Equal(102, m.Groups[1].Index);
+
+ m = m.NextMatch();
+ Assert.True(m.Success);
+ Assert.Equal("http://blogs.msdn.com/bclteam", m.Groups[1].ToString());
+ Assert.Equal(176, m.Groups[1].Index);
+
+ m = m.NextMatch();
+ Assert.False(m.Success);
+ }
+
+ [Theory]
+ [InlineData(RegexOptions.None)]
+ [InlineData(RegexOptions.Compiled)]
+ public void MDYtoDMY(RegexOptions options)
+ {
+ string dt = new DateTime(2020, 1, 8, 0, 0, 0, DateTimeKind.Utc).ToString("d", DateTimeFormatInfo.InvariantInfo);
+ string result = Regex.Replace(dt, @"\b(?\d{1,2})/(?\d{1,2})/(?\d{2,4})\b", "${day}-${month}-${year}", options);
+ Assert.Equal("08-01-2020", result);
+ }
+
+ [Theory]
+ [InlineData(RegexOptions.None)]
+ [InlineData(RegexOptions.Compiled)]
+ public void ExtractProtocolPort(RegexOptions options)
+ {
+ string url = "http://www.contoso.com:8080/letters/readme.html";
+ Regex r = new Regex(@"^(?\w+)://[^/]+?(?:\d+)?/", options);
+ Match m = r.Match(url);
+ Assert.True(m.Success);
+ Assert.Equal("http:8080", m.Result("${proto}${port}"));
+ }
+
+ [Theory]
+ [InlineData("david.jones@proseware.com", true)]
+ [InlineData("d.j@server1.proseware.com", true)]
+ [InlineData("jones@ms1.proseware.com", true)]
+ [InlineData("j.@server1.proseware.com", false)]
+ [InlineData("j@proseware.com9", true)]
+ [InlineData("js#internal@proseware.com", true)]
+ [InlineData("j_9@[129.126.118.1]", true)]
+ [InlineData("j..s@proseware.com", false)]
+ [InlineData("js*@proseware.com", false)]
+ [InlineData("js@proseware..com", false)]
+ [InlineData("js@proseware.com9", true)]
+ [InlineData("j.s@server1.proseware.com", true)]
+ [InlineData("\"j\\\"s\\\"\"@proseware.com", true)]
+ [InlineData("js@contoso.\u4E2D\u56FD", true)]
+ public void ValidateEmail(string email, bool expectedIsValid)
+ {
+ Assert.Equal(expectedIsValid, IsValidEmail(email, RegexOptions.None));
+ Assert.Equal(expectedIsValid, IsValidEmail(email, RegexOptions.Compiled));
+
+ bool IsValidEmail(string email, RegexOptions options)
+ {
+ if (string.IsNullOrWhiteSpace(email))
+ {
+ return false;
+ }
+
+ try
+ {
+ // Normalize the domain
+ email = Regex.Replace(email, @"(@)(.+)$", DomainMapper, options, TimeSpan.FromMilliseconds(200));
+
+ // Examines the domain part of the email and normalizes it.
+ string DomainMapper(Match match)
+ {
+ // Use IdnMapping class to convert Unicode domain names.
+ var idn = new IdnMapping();
+
+ // Pull out and process domain name (throws ArgumentException on invalid)
+ string domainName = idn.GetAscii(match.Groups[2].Value);
+
+ return match.Groups[1].Value + domainName;
+ }
+ }
+ catch (RegexMatchTimeoutException)
+ {
+ return false;
+ }
+ catch (ArgumentException)
+ {
+ return false;
+ }
+
+ try
+ {
+ return Regex.IsMatch(email,
+ @"^(?("")("".+?(? Match_Basic_TestData()
// The last 3 causes the match to fail, since the non backtracking subexpression does not give up the last digit it matched
// for it to be a success. For a correct match, remove the last character, '3' from the pattern
yield return new object[] { "[^0-9]+(?>[0-9]+)3", "abc123", RegexOptions.None, 0, 6, false, string.Empty };
+ yield return new object[] { "[^0-9]+(?>[0-9]+)", "abc123", RegexOptions.None, 0, 6, true, "abc123" };
+
+ // More nonbacktracking expressions
+ foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.IgnoreCase })
+ {
+ string Case(string s) => (options & RegexOptions.IgnoreCase) != 0 ? s.ToUpper() : s;
+
+ yield return new object[] { Case("(?>[0-9]+)abc"), "abc12345abc", options, 3, 8, true, "12345abc" };
+ yield return new object[] { Case("(?>(?>[0-9]+))abc"), "abc12345abc", options, 3, 8, true, "12345abc" };
+ yield return new object[] { Case("(?>[0-9]*)abc"), "abc12345abc", options, 3, 8, true, "12345abc" };
+ yield return new object[] { Case("(?>[^z]+)z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz" };
+ yield return new object[] { Case("(?>(?>[^z]+))z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz" };
+ yield return new object[] { Case("(?>[^z]*)z123"), "zzzzxyxyxyz123", options, 4, 10, true, "xyxyxyz123" };
+ yield return new object[] { Case("(?>a+)123"), "aa1234", options, 0, 5, true, "aa123" };
+ yield return new object[] { Case("(?>a*)123"), "aa1234", options, 0, 5, true, "aa123" };
+ yield return new object[] { Case("(?>(?>a*))123"), "aa1234", options, 0, 5, true, "aa123" };
+ yield return new object[] { Case("(?>a+?)a"), "aaaaa", options, 0, 2, true, "aa" };
+ yield return new object[] { Case("(?>a*?)a"), "aaaaa", options, 0, 1, true, "a" };
+ yield return new object[] { Case("(?>hi|hello|hey)hi"), "hellohi", options, 0, 0, false, string.Empty };
+ yield return new object[] { Case("(?:hi|hello|hey)hi"), "hellohi", options, 0, 7, true, "hellohi" }; // allow backtracking and it succeeds
+ yield return new object[] { Case("(?>hi|hello|hey)hi"), "hihi", options, 0, 4, true, "hihi" };
+ }
// Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z"
yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz" };
@@ -83,6 +105,12 @@ public static IEnumerable Match_Basic_TestData()
yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.RightToLeft, 0, 4, false, string.Empty };
yield return new object[] { @"\Aaaaaa\w+zzzzz\Z", "aaaa", RegexOptions.RightToLeft, 0, 4, false, string.Empty };
yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.RightToLeft | RegexOptions.IgnoreCase, 0, 4, false, string.Empty };
+ yield return new object[] { @"abc\Adef", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
+ yield return new object[] { @"abc\adef", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
+ yield return new object[] { @"abc\Gdef", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
+ yield return new object[] { @"abc^def", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
+ yield return new object[] { @"abc\Zef", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
+ yield return new object[] { @"abc\zef", "abcdef", RegexOptions.None, 0, 0, false, string.Empty };
// Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z"
yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzza", RegexOptions.None, 0, 18, false, string.Empty };
@@ -295,7 +323,10 @@ public static IEnumerable Match_Basic_TestData()
yield return new object[] { @"[a-[a-f]]", "abcdefghijklmnopqrstuvwxyz", RegexOptions.None, 0, 26, false, string.Empty };
// \c
- yield return new object[] { @"(cat)(\c[*)(dog)", "asdlkcat\u00FFdogiwod", RegexOptions.None, 0, 15, false, string.Empty };
+ if (!PlatformDetection.IsFullFramework) // missing fix for #26501
+ {
+ yield return new object[] { @"(cat)(\c[*)(dog)", "asdlkcat\u00FFdogiwod", RegexOptions.None, 0, 15, false, string.Empty };
+ }
// Surrogate pairs splitted up into UTF-16 code units.
yield return new object[] { @"(\uD82F[\uDCA0-\uDCA3])", "\uD82F\uDCA2", RegexOptions.CultureInvariant, 0, 2, true, "\uD82F\uDCA2" };
@@ -381,13 +412,14 @@ public void Match_Timeout_Throws()
// On 32-bit we can't test these high inputs as they cause OutOfMemoryExceptions.
[ConditionalTheory(typeof(Environment), nameof(Environment.Is64BitProcess))]
- [InlineData(RegexOptions.Compiled)]
- [InlineData(RegexOptions.None)]
- public void Match_Timeout_Loop_Throws(RegexOptions options)
+ [InlineData(@"a\s+", RegexOptions.None)]
+ [InlineData(@"a\s+", RegexOptions.Compiled)]
+ [InlineData(@"a\s+ ", RegexOptions.None)]
+ [InlineData(@"a\s+ ", RegexOptions.Compiled)]
+ public void Match_Timeout_Loop_Throws(string pattern, RegexOptions options)
{
- var regex = new Regex(@"a\s+", options, TimeSpan.FromSeconds(1));
- string input = @"a" + new string(' ', 800_000_000) + @"b";
-
+ var regex = new Regex(pattern, options, TimeSpan.FromSeconds(1));
+ string input = "a" + new string(' ', 800_000_000) + " ";
Assert.Throws(() => regex.Match(input));
}
@@ -400,7 +432,6 @@ public void Match_Timeout_Repetition_Throws(RegexOptions options)
int repetitionCount = 800_000_000;
var regex = new Regex(@"a\s{" + repetitionCount+ "}", options, TimeSpan.FromSeconds(1));
string input = @"a" + new string(' ', repetitionCount) + @"b";
-
Assert.Throws(() => regex.Match(input));
}
@@ -808,6 +839,7 @@ public void Match_SpecialUnicodeCharacters_Invariant()
}
[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNotArmProcess))] // times out on ARM
+ [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Full framework needs fix for #26484")]
[SkipOnCoreClr("Long running tests: https://github.com/dotnet/coreclr/issues/18912", RuntimeStressTestModes.JitMinOpts)]
public void Match_ExcessPrefix()
{
@@ -869,5 +901,20 @@ public void IsMatch_Invalid()
Assert.Throws(() => new Regex("pattern").IsMatch("input", -1));
Assert.Throws(() => new Regex("pattern").IsMatch("input", 6));
}
+
+ [Fact]
+ public void Synchronized()
+ {
+ var m = new Regex("abc").Match("abc");
+ Assert.True(m.Success);
+ Assert.Equal("abc", m.Value);
+
+ var m2 = System.Text.RegularExpressions.Match.Synchronized(m);
+ Assert.Same(m, m2);
+ Assert.True(m2.Success);
+ Assert.Equal("abc", m2.Value);
+
+ AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null));
+ }
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
index 321ffe52732cb..783c149e6acd4 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs
@@ -90,6 +90,44 @@ public static IEnumerable Matches_TestData()
}
};
+ yield return new object[]
+ {
+ @"\b\w*\b", "handling words of various lengths", RegexOptions.None,
+ new CaptureData[]
+ {
+ new CaptureData("handling", 0, 8),
+ new CaptureData("", 8, 0),
+ new CaptureData("words", 9, 5),
+ new CaptureData("", 14, 0),
+ new CaptureData("of", 15, 2),
+ new CaptureData("", 17, 0),
+ new CaptureData("various", 18, 7),
+ new CaptureData("", 25, 0),
+ new CaptureData("lengths", 26, 7),
+ new CaptureData("", 33, 0),
+ }
+ };
+
+ yield return new object[]
+ {
+ @"\b\w{2}\b", "handling words of various lengths", RegexOptions.None,
+ new CaptureData[]
+ {
+ new CaptureData("of", 15, 2),
+ }
+ };
+
+ yield return new object[]
+ {
+ @"\w{6,}", "handling words of various lengths", RegexOptions.None,
+ new CaptureData[]
+ {
+ new CaptureData("handling", 0, 8),
+ new CaptureData("various", 18, 7),
+ new CaptureData("lengths", 26, 7),
+ }
+ };
+
yield return new object[]
{
@"foo\d+", "0123456789foo4567890foo1foo 0987", RegexOptions.RightToLeft,
@@ -141,18 +179,21 @@ public static IEnumerable Matches_TestData()
}
};
- yield return new object[]
+ if (!PlatformDetection.IsFullFramework) // missing fix in https://github.com/dotnet/runtime/pull/993
{
- "[^]", "every", RegexOptions.ECMAScript,
- new CaptureData[]
+ yield return new object[]
{
- new CaptureData("e", 0, 1),
- new CaptureData("v", 1, 1),
- new CaptureData("e", 2, 1),
- new CaptureData("r", 3, 1),
- new CaptureData("y", 4, 1),
- }
- };
+ "[^]", "every", RegexOptions.ECMAScript,
+ new CaptureData[]
+ {
+ new CaptureData("e", 0, 1),
+ new CaptureData("v", 1, 1),
+ new CaptureData("e", 2, 1),
+ new CaptureData("r", 3, 1),
+ new CaptureData("y", 4, 1),
+ }
+ };
+ }
}
[Theory]
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs
new file mode 100644
index 0000000000000..7d3896880c60b
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs
@@ -0,0 +1,157 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Globalization;
+using Xunit;
+using Xunit.Sdk;
+
+namespace System.Text.RegularExpressions.Tests
+{
+ public class RegexCharacterSetTests
+ {
+ [Theory]
+ [InlineData(@"a", RegexOptions.None, new[] { 'a' })]
+ [InlineData(@"a", RegexOptions.IgnoreCase, new[] { 'a', 'A' })]
+ [InlineData(@"\u00A9", RegexOptions.None, new[] { '\u00A9' })]
+ [InlineData(@"\u00A9", RegexOptions.IgnoreCase, new[] { '\u00A9' })]
+ [InlineData(@"az", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, new[] { 'a', 'A', 'z', 'Z' })]
+ [InlineData(@"azY", RegexOptions.IgnoreCase, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y' })]
+ [InlineData(@"azY", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y' })]
+ [InlineData(@"azY\u00A9", RegexOptions.IgnoreCase, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y', '\u00A9' })]
+ [InlineData(@"azY\u00A9", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y', '\u00A9' })]
+ [InlineData(@"azY\u00A9\u05D0", RegexOptions.IgnoreCase, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y', '\u00A9', '\u05D0' })]
+ [InlineData(@"azY\u00A9\u05D0", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, new[] { 'a', 'A', 'z', 'Z', 'y', 'Y', '\u00A9', '\u05D0' })]
+ [InlineData(@"a ", RegexOptions.None, new[] { 'a', ' ' })]
+ [InlineData(@"a \t\r", RegexOptions.None, new[] { 'a', ' ', '\t', '\r' })]
+ [InlineData(@"aeiou", RegexOptions.None, new[] { 'a', 'e', 'i', 'o', 'u' })]
+ [InlineData(@"a-a", RegexOptions.None, new[] { 'a' })]
+ [InlineData(@"ab", RegexOptions.None, new[] { 'a', 'b' })]
+ [InlineData(@"a-b", RegexOptions.None, new[] { 'a', 'b' })]
+ [InlineData(@"abc", RegexOptions.None, new[] { 'a', 'b', 'c' })]
+ [InlineData(@"1369", RegexOptions.None, new[] { '1', '3', '6', '9' })]
+ [InlineData(@"ACEGIKMOQSUWY", RegexOptions.None, new[] { 'A', 'C', 'E', 'G', 'I', 'K', 'M', 'O', 'Q', 'S', 'U', 'W', 'Y' })]
+ [InlineData(@"abcAB", RegexOptions.None, new[] { 'A', 'B', 'a', 'b', 'c' })]
+ [InlineData(@"a-c", RegexOptions.None, new[] { 'a', 'b', 'c' })]
+ [InlineData(@"X-b", RegexOptions.None, new[] { 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b' })]
+ [InlineData(@"\u0083\u00DE-\u00E1", RegexOptions.None, new[] { '\u0083', '\u00DE', '\u00DF', '\u00E0', '\u00E1' })]
+ [InlineData(@"\u007A-\u0083\u00DE-\u00E1", RegexOptions.None, new[] { '\u007A', '\u007B', '\u007C', '\u007D', '\u007E', '\u007F', '\u0080', '\u0081', '\u0082', '\u0083', '\u00DE', '\u00DF', '\u00E0', '\u00E1' })]
+ [InlineData(@"\u05D0", RegexOptions.None, new[] { '\u05D0' })]
+ [InlineData(@"a\u05D0", RegexOptions.None, new[] { 'a', '\u05D0' })]
+ [InlineData(@"\uFFFC-\uFFFF", RegexOptions.None, new[] { '\uFFFC', '\uFFFD', '\uFFFE', '\uFFFF' })]
+ [InlineData(@"a-z-[d-w-[m-o]]", RegexOptions.None, new[] { 'a', 'b', 'c', 'm', 'n', 'n', 'o', 'x', 'y', 'z' })]
+ [InlineData(@"\p{IsBasicLatin}-[\x00-\x7F]", RegexOptions.None, new char[0])]
+ [InlineData(@"0-9-[2468]", RegexOptions.None, new[] { '0', '1', '3', '5', '7', '9' })]
+ public void SetInclusionsExpected(string set, RegexOptions options, char[] expectedIncluded)
+ {
+ ValidateSet($"[{set}]", options, new HashSet<char>(expectedIncluded), null);
+ if (!set.Contains("["))
+ {
+ ValidateSet($"[^{set}]", options, null, new HashSet<char>(expectedIncluded));
+ }
+ }
+
+ [Fact]
+ public void DotInclusionsExpected()
+ {
+ ValidateSet(".", RegexOptions.None, null, new HashSet<char>() { '\n' });
+ ValidateSet(".", RegexOptions.IgnoreCase, null, new HashSet<char>() { '\n' });
+ ValidateSet(".", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, null, new HashSet<char>() { '\n' });
+
+ ValidateSet(".", RegexOptions.Singleline, null, new HashSet<char>());
+ ValidateSet(".", RegexOptions.Singleline | RegexOptions.IgnoreCase, null, new HashSet<char>());
+ ValidateSet(".", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, null, new HashSet<char>());
+ }
+
+ [Fact]
+ public void WhitespaceInclusionsExpected()
+ {
+ var whitespaceInclusions = ComputeIncludedSet(char.IsWhiteSpace);
+ ValidateSet(@"[\s]", RegexOptions.None, whitespaceInclusions, null);
+ ValidateSet(@"[^\s]", RegexOptions.None, null, whitespaceInclusions);
+ ValidateSet(@"[\S]", RegexOptions.None, null, whitespaceInclusions);
+ }
+
+ [Fact]
+ public void DigitInclusionsExpected()
+ {
+ var digitInclusions = ComputeIncludedSet(char.IsDigit);
+ ValidateSet(@"[\d]", RegexOptions.None, digitInclusions, null);
+ ValidateSet(@"[^\d]", RegexOptions.None, null, digitInclusions);
+ ValidateSet(@"[\D]", RegexOptions.None, null, digitInclusions);
+ }
+
+ [Theory]
+ [InlineData(@"\p{Lu}", new[] { UnicodeCategory.UppercaseLetter })]
+ [InlineData(@"\p{S}", new[] { UnicodeCategory.CurrencySymbol, UnicodeCategory.MathSymbol, UnicodeCategory.ModifierSymbol, UnicodeCategory.OtherSymbol })]
+ [InlineData(@"\p{Lu}\p{Zl}", new[] { UnicodeCategory.UppercaseLetter, UnicodeCategory.LineSeparator })]
+ [InlineData(@"\w", new[] { UnicodeCategory.LowercaseLetter, UnicodeCategory.UppercaseLetter, UnicodeCategory.TitlecaseLetter, UnicodeCategory.OtherLetter, UnicodeCategory.ModifierLetter, UnicodeCategory.NonSpacingMark, UnicodeCategory.DecimalDigitNumber, UnicodeCategory.ConnectorPunctuation })]
+
+ public void UnicodeCategoryInclusionsExpected(string set, UnicodeCategory[] categories)
+ {
+ var categoryInclusions = ComputeIncludedSet(c => Array.IndexOf(categories, char.GetUnicodeCategory(c)) >= 0);
+ ValidateSet($"[{set}]", RegexOptions.None, categoryInclusions, null);
+ ValidateSet($"[^{set}]", RegexOptions.None, null, categoryInclusions);
+ }
+
+ [Theory]
+ [InlineData(@"\p{IsGreek}", new[] { 0x0370, 0x03FF })]
+ [InlineData(@"\p{IsRunic}\p{IsHebrew}", new[] { 0x0590, 0x05FF, 0x16A0, 0x16FF })]
+ [InlineData(@"abx-z\p{IsRunic}\p{IsHebrew}", new[] { 0x0590, 0x05FF, 0x16A0, 0x16FF, 'a', 'a', 'b', 'b', 'x', 'x', 'y', 'z' })]
+ public void NamedBlocksInclusionsExpected(string set, int[] ranges)
+ {
+ var included = new HashSet<char>();
+ for (int i = 0; i < ranges.Length - 1; i += 2)
+ {
+ ComputeIncludedSet(c => c >= ranges[i] && c <= ranges[i + 1], included);
+ }
+
+ ValidateSet($"[{set}]", RegexOptions.None, included, null);
+ ValidateSet($"[^{set}]", RegexOptions.None, null, included);
+ }
+
+ private static HashSet<char> ComputeIncludedSet(Func<char, bool> func)
+ {
+ var included = new HashSet<char>();
+ ComputeIncludedSet(func, included);
+ return included;
+ }
+
+ private static void ComputeIncludedSet(Func<char, bool> func, HashSet<char> included)
+ {
+ for (int i = 0; i <= char.MaxValue; i++)
+ {
+ if (func((char)i))
+ {
+ included.Add((char)i);
+ }
+ }
+ }
+
+ [Fact]
+ public void ValidateValidateSet()
+ {
+ Assert.Throws<XunitException>(() => ValidateSet("[a]", RegexOptions.None, new HashSet<char>() { 'b' }, null));
+ Assert.Throws<XunitException>(() => ValidateSet("[b]", RegexOptions.None, null, new HashSet<char>() { 'b' }));
+ }
+
+ private static void ValidateSet(string regex, RegexOptions options, HashSet<char> included, HashSet<char> excluded)
+ {
+ Assert.True((included != null) ^ (excluded != null));
+ foreach (RegexOptions compiled in new[] { RegexOptions.None, RegexOptions.Compiled })
+ {
+ var r = new Regex(regex, options | compiled);
+ for (int i = 0; i <= char.MaxValue; i++)
+ {
+ bool actual = r.IsMatch(((char)i).ToString());
+ bool expected = included != null ? included.Contains((char)i) : !excluded.Contains((char)i);
+ if (actual != expected)
+ {
+ throw new XunitException($"Set=\"{regex}\", Options=\"{options}\", {i.ToString("X4")} => '{(char)i}' returned {actual}");
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs
index 45832389ee0e9..53d7e13fcd32d 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs
@@ -24,7 +24,17 @@ public static IEnumerable Ctor_MemberData()
[MemberData(nameof(Ctor_MemberData))]
public void Ctor_ValidArguments_CheckProperties(string pattern, RegexOptions options, string name, string fullnamespace, bool ispublic, TimeSpan matchTimeout)
{
- var regexCompilationInfo = new RegexCompilationInfo(pattern, options, name, fullnamespace, ispublic, matchTimeout);
+ RegexCompilationInfo regexCompilationInfo;
+
+ regexCompilationInfo = new RegexCompilationInfo(pattern, options, name, fullnamespace, ispublic);
+ Assert.Equal(pattern, regexCompilationInfo.Pattern);
+ Assert.Equal(options, regexCompilationInfo.Options);
+ Assert.Equal(name, regexCompilationInfo.Name);
+ Assert.Equal(fullnamespace, regexCompilationInfo.Namespace);
+ Assert.Equal(ispublic, regexCompilationInfo.IsPublic);
+ Assert.Equal(Regex.InfiniteMatchTimeout, regexCompilationInfo.MatchTimeout);
+
+ regexCompilationInfo = new RegexCompilationInfo(pattern, options, name, fullnamespace, ispublic, matchTimeout);
Assert.Equal(pattern, regexCompilationInfo.Pattern);
Assert.Equal(options, regexCompilationInfo.Options);
Assert.Equal(name, regexCompilationInfo.Name);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexMatchTimeoutExceptionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexMatchTimeoutExceptionTests.cs
new file mode 100644
index 0000000000000..606580344eb63
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexMatchTimeoutExceptionTests.cs
@@ -0,0 +1,66 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using System.Runtime.Serialization.Formatters.Binary;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+ public class RegexMatchTimeoutExceptionTests
+ {
+ [Fact]
+ public void Ctor()
+ {
+ RegexMatchTimeoutException e;
+
+ e = new RegexMatchTimeoutException();
+ Assert.Empty(e.Input);
+ Assert.Empty(e.Pattern);
+ Assert.Equal(TimeSpan.FromTicks(-1), e.MatchTimeout);
+
+ const string Message = "some message";
+ e = new RegexMatchTimeoutException(Message);
+ Assert.Equal(Message, e.Message);
+ Assert.Empty(e.Input);
+ Assert.Empty(e.Pattern);
+ Assert.Equal(TimeSpan.FromTicks(-1), e.MatchTimeout);
+
+ var inner = new FormatException();
+ e = new RegexMatchTimeoutException(Message, inner);
+ Assert.Equal(Message, e.Message);
+ Assert.Same(inner, e.InnerException);
+ Assert.Empty(e.Input);
+ Assert.Empty(e.Pattern);
+ Assert.Equal(TimeSpan.FromTicks(-1), e.MatchTimeout);
+
+ const string Input = "abcdef";
+ const string Pattern = "(?:abcdef)*";
+ TimeSpan timeout = TimeSpan.FromSeconds(42);
+ e = new RegexMatchTimeoutException(Input, Pattern, timeout);
+ Assert.Equal(Input, e.Input);
+ Assert.Equal(Pattern, e.Pattern);
+ Assert.Equal(timeout, e.MatchTimeout);
+ }
+
+ [Fact]
+ public void SerializationRoundtrip()
+ {
+ const string Input = "abcdef";
+ const string Pattern = "(?:abcdef)*";
+ TimeSpan timeout = TimeSpan.FromSeconds(42);
+ var e = new RegexMatchTimeoutException(Input, Pattern, timeout);
+
+ var bf = new BinaryFormatter();
+ var s = new MemoryStream();
+ bf.Serialize(s, e);
+ s.Position = 0;
+ e = (RegexMatchTimeoutException)bf.Deserialize(s);
+
+ Assert.Equal(Input, e.Input);
+ Assert.Equal(Pattern, e.Pattern);
+ Assert.Equal(timeout, e.MatchTimeout);
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexParserTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexParserTests.cs
index 61003d2a5d83c..b40e5496d7bbc 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/RegexParserTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexParserTests.cs
@@ -2,7 +2,9 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+using System.IO;
using System.Reflection;
+using System.Runtime.Serialization.Formatters.Binary;
using Xunit;
using Xunit.Sdk;
@@ -15,8 +17,11 @@ public class RegexParserTests
static RegexParserTests()
{
- s_parseExceptionType = typeof(Regex).Assembly.GetType("System.Text.RegularExpressions.RegexParseException", true);
- s_parseErrorField = s_parseExceptionType.GetField("_error", BindingFlags.NonPublic | BindingFlags.Instance);
+ if (!PlatformDetection.IsFullFramework)
+ {
+ s_parseExceptionType = typeof(Regex).Assembly.GetType("System.Text.RegularExpressions.RegexParseException", true);
+ s_parseErrorField = s_parseExceptionType.GetField("_error", BindingFlags.NonPublic | BindingFlags.Instance);
+ }
}
[Theory]
@@ -800,11 +805,27 @@ public void Parse(string pattern, RegexOptions options, object errorObj)
[InlineData("a{0,2147483648}", RegexOptions.None, RegexParseError.CaptureGroupOutOfRange)]
// Surrogate pair which is parsed as [char,char-char,char] as we operate on UTF-16 code units.
[InlineData("[\uD82F\uDCA0-\uD82F\uDCA3]", RegexOptions.IgnoreCase, RegexParseError.ReversedCharRange)]
+ [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)]
public void Parse_NotNetFramework(string pattern, RegexOptions options, object error)
{
Parse(pattern, options, error);
}
+ [Fact]
+ [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)]
+ public void RegexParseException_Serializes()
+ {
+ ArgumentException e = Assert.ThrowsAny<ArgumentException>(() => new Regex("(abc|def"));
+
+ var bf = new BinaryFormatter();
+ var s = new MemoryStream();
+ bf.Serialize(s, e);
+ s.Position = 0;
+
+ ArgumentException e2 = (ArgumentException)bf.Deserialize(s);
+ Assert.Equal(e.Message, e2.Message);
+ }
+
private static void ParseSubTrees(string pattern, RegexOptions options)
{
// Trim the input from the right and make sure tree invariants hold
@@ -858,11 +879,19 @@ private static void ParseSubTree(string pattern, RegexOptions options)
/// The action to invoke.
private static void Throws(RegexParseError error, Action action)
{
+ // If no specific error is supplied, or we are running on full framework where RegexParseException
+ // is not available, we expect an ArgumentException.
+ if (PlatformDetection.IsFullFramework)
+ {
+ Assert.ThrowsAny<ArgumentException>(action);
+ return;
+ }
+
try
{
action();
}
- catch (Exception e)
+ catch (ArgumentException e)
{
// We use reflection to check if the exception is an internal RegexParseException
// and extract its error property and compare with the given one.
diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
index bf60979b1b5d9..e3f8c12581c94 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
+++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
@@ -10,19 +10,23 @@
+
+
-
-
-
+
+
+
-
+
+
+
System\Text\RegularExpressions\RegexParseError.cs