Skip to content

Commit

Permalink
Merge pull request dotnet#1348 from stephentoub/moreregexperf
Browse files Browse the repository at this point in the history
More Regex perf improvements
  • Loading branch information
stephentoub authored Jan 9, 2020
2 parents 72b871d + 95965c9 commit 0434b87
Show file tree
Hide file tree
Showing 31 changed files with 4,507 additions and 719 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
<PropertyGroup>
<AssemblyName>System.Text.RegularExpressions</AssemblyName>
<DefineConstants>$(DefineConstants);FEATURE_COMPILED</DefineConstants>
<DefineConstants>$(DefineConstants);FEATURE_COMPILEAPIS</DefineConstants>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<Configurations>$(NetCoreAppCurrent)-Debug;$(NetCoreAppCurrent)-Release</Configurations>
<Nullable>enable</Nullable>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,32 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

// This is the only concrete implementation of RegexRunnerFactory,
// but we cannot combine them due to RegexRunnerFactory having shipped public.
using System.Reflection.Emit;

namespace System.Text.RegularExpressions
{
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
{
private readonly Action<RegexRunner> _go;
private readonly Func<RegexRunner, bool> _findFirstChar;
private readonly Action<RegexRunner> _initTrackCount;
private readonly DynamicMethod _goMethod;
private readonly DynamicMethod _findFirstCharMethod;
private readonly DynamicMethod _initTrackCountMethod;

public CompiledRegexRunnerFactory(Action<RegexRunner> go, Func<RegexRunner, bool> findFirstChar, Action<RegexRunner> initTrackCount)
// Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed.
private Action<RegexRunner>? _go;
private Func<RegexRunner, bool>? _findFirstChar;
private Action<RegexRunner>? _initTrackCount;

public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, DynamicMethod initTrackCountMethod)
{
_go = go;
_findFirstChar = findFirstChar;
_initTrackCount = initTrackCount;
_goMethod = goMethod;
_findFirstCharMethod = findFirstCharMethod;
_initTrackCountMethod = initTrackCountMethod;
}

protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner(_go, _findFirstChar, _initTrackCount);
protected internal override RegexRunner CreateInstance() =>
new CompiledRegexRunner(
_go ??= (Action<RegexRunner>)_goMethod.CreateDelegate(typeof(Action<RegexRunner>)),
_findFirstChar ??= (Func<RegexRunner, bool>)_findFirstCharMethod.CreateDelegate(typeof(Func<RegexRunner, bool>)),
_initTrackCount ??= (Action<RegexRunner>)_initTrackCountMethod.CreateDelegate(typeof(Action<RegexRunner>)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
//

using System.Collections;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;

namespace System.Text.RegularExpressions
Expand Down Expand Up @@ -331,6 +332,7 @@ internal void Tidy(int textpos)
}

#if DEBUG
/// <summary>
/// True when <c>_regex</c> is non-null and its <c>Debug</c> property is true; otherwise false.
/// </summary>
[ExcludeFromCodeCoverage]
internal bool Debug => _regex != null && _regex.Debug;

internal virtual void Dump()
Expand Down Expand Up @@ -372,6 +374,7 @@ internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int
public override GroupCollection Groups => _groupcoll ??= new GroupCollection(this, _caps);

#if DEBUG
[ExcludeFromCodeCoverage]
internal override void Dump()
{
if (_caps != null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ private static RegexRunnerFactory Compile(RegexCode code, RegexOptions options,
}
#endif

#if FEATURE_COMPILEAPIS
public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, AssemblyName assemblyname)
{
throw new PlatformNotSupportedException(SR.PlatformNotSupported_CompileToAssembly);
Expand All @@ -222,7 +221,6 @@ public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, Assembly
{
throw new PlatformNotSupportedException(SR.PlatformNotSupported_CompileToAssembly);
}
#endif // FEATURE_COMPILEAPIS

/// <summary>
/// Escapes a minimal set of metacharacters (\, *, +, ?, |, {, [, (, ), ^, $, ., #, and
Expand Down Expand Up @@ -461,6 +459,7 @@ protected void InitializeReferences()
/// <summary>
/// True if the regex has debugging enabled
/// </summary>
[ExcludeFromCodeCoverage]
internal bool Debug => (roptions & RegexOptions.Debug) != 0;
#endif
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// need to be examined.

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;

namespace System.Text.RegularExpressions
Expand Down Expand Up @@ -338,8 +339,10 @@ public int Scan(string text, int index, int beglimit, int endlimit)
/// <summary>
/// Used when dumping for debugging.
/// </summary>
[ExcludeFromCodeCoverage]
public override string ToString() => Pattern;

[ExcludeFromCodeCoverage]
public string Dump(string indent)
{
StringBuilder sb = new StringBuilder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Threading;

Expand Down Expand Up @@ -733,7 +734,7 @@ public static string ConvertOldStringsToClass(string set, string category)
/// </summary>
public static char SingletonChar(string set)
{
Debug.Assert(IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
return set[SetStartIndex];
}

Expand All @@ -748,13 +749,138 @@ public static bool IsEmpty(string charClass) =>
!IsNegated(charClass) &&
!IsSubtraction(charClass);

/// <summary>Determines whether the set matches exactly one character.</summary>
/// <param name="set">The character class string to inspect.</param>
/// <returns>
/// <c>true</c> when the class has no categories, is neither negated nor a subtraction, and its
/// range list is a single start/end pair describing one character; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// Such sets are not only written by hand as single-character classes; far more often the
/// implementation/parser produces them itself, e.g. when looking for \n as part of finding the
/// end of a line, or when processing an alternation like "hello|hithere" where every option
/// starts with the same character.
/// </remarks>
public static bool IsSingleton(string set)
{
    // Must be a plain class: no categories and exactly one start/end pair.
    if (set[CategoryLengthIndex] != 0 || set[SetLengthIndex] != 2)
    {
        return false;
    }

    // Negated classes and subtractions are not singletons.
    if (IsNegated(set) || IsSubtraction(set))
    {
        return false;
    }

    // The pair describes one character when its end is start + 1; a start of LastChar is also accepted.
    char first = set[SetStartIndex];
    return first == LastChar || first + 1 == set[SetStartIndex + 1];
}

/// <summary>Determines whether the set is the negation of a single-character class.</summary>
/// <param name="set">The character class string to inspect.</param>
/// <returns>
/// <c>true</c> when the class is negated, has no categories, is not a subtraction, and its
/// range list is a single start/end pair describing one character; otherwise <c>false</c>.
/// </returns>
public static bool IsSingletonInverse(string set)
{
    // Shape check first: no categories and exactly one start/end pair.
    bool singlePairNoCategories = set[CategoryLengthIndex] == 0 && set[SetLengthIndex] == 2;

    // Unlike a plain singleton, the class must be negated here; subtraction is still disallowed.
    if (!singlePairNoCategories || !IsNegated(set) || IsSubtraction(set))
    {
        return false;
    }

    // The pair describes one character when its end is start + 1; a start of LastChar is also accepted.
    char first = set[SetStartIndex];
    return first == LastChar || first + 1 == set[SetStartIndex + 1];
}

/// <summary>Copies every character of the specified set into the provided span.</summary>
/// <param name="set">The character class.</param>
/// <param name="chars">The destination span that receives the characters.</param>
/// <returns>
/// The number of characters stored. 0 is returned if the set cannot be enumerated or if
/// its characters do not all fit into <paramref name="chars"/>.
/// </returns>
/// <remarks>
/// Only handles classes accepted by <see cref="CanEasilyEnumerateSetContents"/>: simple lists of
/// start/end pairs with no categories, no negation, and no subtraction. When 0 is returned
/// because the span was too small, the span may already have been partially written.
/// </remarks>
public static int GetSetChars(string set, Span<char> chars)
{
    if (CanEasilyEnumerateSetContents(set))
    {
        int written = 0;
        int rangesEnd = SetStartIndex + set[SetLengthIndex];

        // Walk the range list two entries at a time: [start, end) with an exclusive end.
        for (int pair = SetStartIndex; pair < rangesEnd; pair += 2)
        {
            int upperExclusive = set[pair + 1];
            int current = set[pair];

            while (current < upperExclusive)
            {
                // Out of room: report failure rather than a truncated result.
                if (written >= chars.Length)
                {
                    return 0;
                }

                chars[written] = (char)current;
                written++;
                current++;
            }
        }

        return written;
    }

    return 0;
}

/// <summary>
/// Conservatively determines whether two character classes could match a common character.
/// </summary>
/// <param name="set1">The first character class.</param>
/// <param name="set2">The second character class.</param>
/// <returns><c>false</c> if the two sets do not overlap; <c>true</c> if they may.</returns>
/// <remarks>
/// A <c>false</c> result guarantees the sets do not overlap. A <c>true</c> result does not
/// guarantee that they do: it is also returned whenever the method cannot tell.
/// </remarks>
public static bool MayOverlap(string set1, string set2)
{
    // If either set is all-inclusive, there's overlap.
    if (set1 == AnyClass || set2 == AnyClass)
    {
        return true;
    }

    // Sets that are identical from index 1 onward but differ in negation are exact
    // complements of each other, so they cannot overlap.
    bool differOnlyInNegation =
        IsNegated(set1) != IsNegated(set2) &&
        set1.AsSpan(1).SequenceEqual(set2.AsSpan(1));
    if (differOnlyInNegation)
    {
        return false;
    }

    // Some known, common classes never share characters; test the pairing in both orders.
    if (AreKnownDisjoint(set1, set2) || AreKnownDisjoint(set2, set1))
    {
        return false;
    }

    // If set2 can be easily enumerated (e.g. no unicode categories), walk its members and
    // look each one up in set1.
    if (CanEasilyEnumerateSetContents(set2))
    {
        return AnyEnumeratedCharInClass(set2, set1);
    }

    // Otherwise try the same thing the other way around.
    if (CanEasilyEnumerateSetContents(set1))
    {
        return AnyEnumeratedCharInClass(set1, set2);
    }

    // Assume that everything else might overlap. If it proved impactful in the future, this could be
    // made more accurate, at the expense of more computation time.
    return true;

    // True when the first set is a whitespace class and the second is a digit or word class.
    static bool AreKnownDisjoint(string spaceCandidate, string digitOrWordCandidate)
    {
        bool isSpace = spaceCandidate == SpaceClass || spaceCandidate == ECMASpaceClass;
        return isSpace &&
            (digitOrWordCandidate == DigitClass ||
             digitOrWordCandidate == WordClass ||
             digitOrWordCandidate == ECMADigitClass ||
             digitOrWordCandidate == ECMAWordClass);
    }

    // Enumerates every character in enumerableSet's start/end pairs (end exclusive) and
    // reports whether CharInClass accepts any of them for targetSet.
    static bool AnyEnumeratedCharInClass(string enumerableSet, string targetSet)
    {
        int rangesEnd = SetStartIndex + enumerableSet[SetLengthIndex];
        for (int pair = SetStartIndex; pair < rangesEnd; pair += 2)
        {
            int upperExclusive = enumerableSet[pair + 1];
            int current = enumerableSet[pair];

            while (current < upperExclusive)
            {
                if (CharInClass((char)current, targetSet))
                {
                    return true;
                }

                current++;
            }
        }

        return false;
    }
}

/// <summary>
/// Gets whether the set's list of start/end pairs can simply be iterated in order to completely
/// enumerate the set's contents.
/// </summary>
/// <param name="set">The character class string to inspect.</param>
/// <returns>
/// <c>true</c> when the string is longer than the set start index, its range list has a positive,
/// even length, it has no categories, and it is neither negated nor a subtraction.
/// </returns>
internal static bool CanEasilyEnumerateSetContents(string set)
{
    // Too short to hold any range data.
    if (set.Length <= SetStartIndex)
    {
        return false;
    }

    // The range list must be non-empty and made up of complete start/end pairs.
    int rangeListLength = set[SetLengthIndex];
    if (rangeListLength <= 0 || rangeListLength % 2 != 0)
    {
        return false;
    }

    // Categories, negation, and subtraction all prevent a straightforward enumeration.
    if (set[CategoryLengthIndex] != 0)
    {
        return false;
    }

    return !IsNegated(set) && !IsSubtraction(set);
}

internal static bool IsSubtraction(string charClass) =>
charClass.Length > SetStartIndex +
charClass[CategoryLengthIndex] +
Expand Down Expand Up @@ -1249,6 +1375,7 @@ private static ReadOnlySpan<char> SetFromProperty(string capname, bool invert, s
/// <summary>
/// Produces a human-readable description for a set string.
/// </summary>
[ExcludeFromCodeCoverage]
public static string SetDescription(string set)
{
int setLength = set[SetLengthIndex];
Expand Down Expand Up @@ -1347,6 +1474,7 @@ public static string SetDescription(string set)
/// <summary>
/// Produces a human-readable description for a single character.
/// </summary>
[ExcludeFromCodeCoverage]
public static string CharDescription(char ch)
{
if (ch == '\\')
Expand Down Expand Up @@ -1382,6 +1510,7 @@ public static string CharDescription(char ch)
return sb.ToString();
}

[ExcludeFromCodeCoverage]
private static string CategoryDescription(char ch)
{
if (ch == SpaceConst)
Expand Down
Loading

0 comments on commit 0434b87

Please sign in to comment.