Skip to content

Commit

Permalink
Use IndexOf(..., span) in Regex FindFirstChar (dotnet#60888)
Browse files Browse the repository at this point in the history
* Use IndexOf(..., span) in Regex FindFirstChar

We currently use Boyer-Moore to find a multi-character prefix in FindFirstChar.  That's good, and when it was originally added it was very likely better than IndexOf(..., string), but since then the latter has been vectorized and its throughput has improved significantly (we also plan to improve it further).  While there are situations where Boyer-Moore does better, for general text searching and typical cases, IndexOf wins.  So, this commit adds the ability for FindFirstChar to use IndexOf to search for a span, and prefers that over Boyer-Moore when it's applicable, namely when we're processing left-to-right and the prefix is either matched case-sensitively or contains no characters that participate in case conversion.

* Centralize check for whether IndexOf is supported
  • Loading branch information
stephentoub authored Oct 27, 2021
1 parent 3ceff6d commit 034d5e2
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,16 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
{
EmitAnchors();

if (code.BoyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null })
if (code.BoyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm)
{
EmitBoyerMoore();
if (rbm.PatternSupportsIndexOf)
{
EmitIndexOf(rbm.Pattern);
}
else
{
EmitBoyerMoore(rbm);
}
}
else if (lcc is not null)
{
Expand Down Expand Up @@ -408,13 +415,8 @@ void EmitAnchors()
}
}

void EmitBoyerMoore()
void EmitBoyerMoore(RegexBoyerMoore rbm)
{
RegexBoyerMoore? rbm = code.BoyerMoorePrefix;
Debug.Assert(rbm is RegexBoyerMoore { NegativeUnicode: null });

writer.WriteLine("// Boyer-Moore prefix matching");

EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm);

int beforefirst;
Expand Down Expand Up @@ -535,6 +537,16 @@ void EmitBoyerMoore()
}
}

// Writes generated code that looks for the literal prefix within
// runtext[runtextpos..runtextend) via MemoryExtensions.IndexOf and, when the prefix
// is found, stores the match position in base.runtextpos and returns true.
// Nothing is written here for the not-found case.
void EmitIndexOf(string prefix)
{
    // The search itself: the prefix is embedded in the generated code as a literal.
    writer.WriteLine(
        $"int i = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos), {Literal(prefix)});");

    // The success path, written out one generated line at a time.
    string[] successLines =
    {
        "if (i >= 0)",
        "{",
        " base.runtextpos = runtextpos + i;",
        " return true;",
        "}",
    };

    foreach (string generatedLine in successLines)
    {
        writer.WriteLine(generatedLine);
    }
}

void EmitLeadingCharacter_RightToLeft()
{
EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,14 @@ public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, C
}
}

// TODO: We should be able to avoid producing the RegexBoyerMoore instance
// entirely if we're going to go down the code path of using IndexOf. That will
// require some refactoring, though.

/// <summary>Gets a value indicating whether the pattern can be located with IndexOf.</summary>
/// <remarks>
/// IndexOf is ruled out for right-to-left matching, and for case-insensitive matching
/// when the pattern contains a character that participates in case conversion.
/// </remarks>
public bool PatternSupportsIndexOf =>
    !(RightToLeft || (CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(Pattern)));

/// <summary>
/// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,21 @@ public static bool ParticipatesInCaseConversion(int comparison)
}
}

/// <summary>Determines whether any character of the specified string participates in case conversion.</summary>
/// <remarks>
/// Each character is checked in order and the scan stops at the first one that participates.
/// An empty string yields <see langword="false"/>.
/// </remarks>
public static bool ParticipatesInCaseConversion(string s)
{
    for (int index = 0; index < s.Length; index++)
    {
        if (ParticipatesInCaseConversion(s[index]))
        {
            return true;
        }
    }

    return false;
}

/// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
private static bool CanEasilyEnumerateSetContents(string set) =>
set.Length > SetStartIndex &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Item", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOf = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
Expand Down Expand Up @@ -1013,9 +1014,16 @@ protected void GenerateFindFirstChar()

GenerateAnchorChecks();

if (_boyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null })
if (_boyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm)
{
GenerateBoyerMoore();
if (rbm.PatternSupportsIndexOf)
{
GenerateIndexOf(rbm.Pattern);
}
else
{
GenerateBoyerMoore(rbm);
}
}
else if (_leadingCharClasses is not null)
{
Expand Down Expand Up @@ -1212,12 +1220,8 @@ void GenerateAnchorChecks()
}
}

void GenerateBoyerMoore()
void GenerateBoyerMoore(RegexBoyerMoore rbm)
{
RegexBoyerMoore? rbm = _boyerMoorePrefix;
Debug.Assert(rbm is RegexBoyerMoore { NegativeUnicode: null });

// Compiled Boyer-Moore string matching
LocalBuilder limitLocal;
int beforefirst;
int last;
Expand Down Expand Up @@ -1425,6 +1429,43 @@ void GenerateBoyerMoore()
}
}

// Emits IL that searches the not-yet-scanned portion of the input for the literal prefix
// using a span-based IndexOf. On a hit, the emitted code stores the match position into
// the runtextpos field and returns true; on a miss it branches to the returnFalse label.
void GenerateIndexOf(string prefix)
{
// Temporary int local for the IndexOf result; released when this function exits (using declaration).
using RentedLocalBuilder i = RentInt32Local();

// int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(prefix);
Ldthis();
Ldfld(s_runtextField);
Ldloc(_runtextposLocal);
Ldloc(_runtextendLocal);
Ldloc(_runtextposLocal);
Sub();
Call(s_stringAsSpanIntIntMethod);
// The prefix is loaded as a string constant and converted before the span-vs-span IndexOf call.
Ldstr(prefix);
Call(s_stringAsSpanMethod);
Call(s_spanIndexOfSpan);
Stloc(i);

// if (i < 0)
// {
// base.runtextpos = runtextend;
// return false;
// }
// NOTE(review): only the branch is emitted here; the body shown above is what the
// returnFalse target is expected to do. That label is defined outside this function — confirm.
Ldloc(i);
Ldc(0);
BltFar(returnFalse);

// base.runtextpos = runtextpos + i;
// return true;
Ldthis();
Ldloc(_runtextposLocal);
Ldloc(i);
Add();
Stfld(s_runtextposField);
Ldc(1);
Ret();
}

void GenerateLeadingCharacter_RightToLeft()
{
Debug.Assert(_leadingCharClasses.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft");
Expand Down Expand Up @@ -1585,7 +1626,7 @@ void GenerateLeadingCharacter_LeftToRight()
case 1:
// tmp = ...IndexOf(setChars[0]);
Ldc(setChars[0]);
Call(s_spanIndexOf);
Call(s_spanIndexOfChar);
break;

case 2:
Expand Down Expand Up @@ -2849,7 +2890,7 @@ void EmitSingleCharAtomicLoop(RegexNode node)
Ldloc(textSpanLocal);
}
Ldc(node.Ch);
Call(s_spanIndexOf);
Call(s_spanIndexOfChar);
Stloc(iterationLocal);

// if (i >= 0) goto atomicLoopDoneLabel;
Expand Down Expand Up @@ -4553,7 +4594,7 @@ private void GenerateOneCode()
Ldloc(lenLocal);
Call(s_stringAsSpanIntIntMethod);
Ldc(Operand(0));
Call(s_spanIndexOf);
Call(s_spanIndexOfChar);
Stloc(iLocal);

Label charFound = DefineLabel();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ public RegexInterpreter(RegexCode code, CultureInfo culture)
(_, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ,
};
}
else if (code.BoyerMoorePrefix is not null)
else if (code.BoyerMoorePrefix is RegexBoyerMoore rbm)
{
_findFirstCharMode = FindFirstCharMode.BoyerMoore;
_findFirstCharMode = rbm.PatternSupportsIndexOf ?
FindFirstCharMode.IndexOf :
FindFirstCharMode.BoyerMoore;
}
else if (code.LeadingCharClasses is not null)
{
Expand Down Expand Up @@ -382,6 +384,7 @@ private enum FindFirstCharMode
LeadingAnchor_RightToLeft_EndZ,
LeadingAnchor_RightToLeft_End,

IndexOf,
BoyerMoore,

LeadingCharClass_LeftToRight_CaseSensitive_Singleton,
Expand Down Expand Up @@ -507,7 +510,19 @@ protected override bool FindFirstChar()
}
return NoPrefixOrPrefixMatches();

// There were no anchors, but there was a Boyer-Moore prefix. Scan for it.
// There was a prefix. Scan for it.

case FindFirstCharMode.IndexOf:
{
int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(_code.BoyerMoorePrefix!.Pattern);
if (i >= 0)
{
runtextpos += i;
return true;
}
runtextpos = runtextend;
return false;
}

case FindFirstCharMode.BoyerMoore:
runtextpos = _code.BoyerMoorePrefix!.Scan(runtext!, runtextpos, runtextbeg, runtextend);
Expand Down

0 comments on commit 034d5e2

Please sign in to comment.