Skip to content

Commit

Permalink
Optimize WithUpper/WithLower with InsertSelectedScalar, SpanHelpers.Sequence APIs (dotnet#38075)

Browse files Browse the repository at this point in the history

* Add a note for SpanHelpers.SequenceCompareTo() and SequenceEqual() method

* Use InsertSelectedScalar for WithUpper() and WithLower()
  • Loading branch information
kunalspathak authored Jun 24, 2020
1 parent 313b165 commit bf8aba0
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1682,12 +1682,9 @@ public static Vector64<T> GetLower<T>(this Vector128<T> vector)
public static Vector128<T> WithLower<T>(this Vector128<T> vector, Vector64<T> value)
where T : struct
{
if (AdvSimd.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
// Note: The 3rd operand GetElement() should be the argument to Insert(). Storing the
// result of GetElement() in a local variable and then passing local variable to Insert()
// would not merge insert/getelement in a single instruction.
return AdvSimd.Insert(vector.AsUInt64(), 0, value.AsUInt64().GetElement(0)).As<ulong, T>();
return AdvSimd.Arm64.InsertSelectedScalar(vector.AsUInt64(), 0, value.ToVector128Unsafe().AsUInt64(), 0).As<ulong, T>();
}

return SoftwareFallback(vector, value);
Expand Down Expand Up @@ -1727,12 +1724,9 @@ public static Vector64<T> GetUpper<T>(this Vector128<T> vector)
public static Vector128<T> WithUpper<T>(this Vector128<T> vector, Vector64<T> value)
where T : struct
{
if (AdvSimd.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
// Note: The 3rd operand GetElement() should be the argument to Insert(). Storing the
// result of GetElement() in a local variable and then passing local variable to Insert()
// would not merge insert/getelement in a single instruction.
return AdvSimd.Insert(vector.AsUInt64(), 1, value.AsUInt64().GetElement(0)).As<ulong, T>();
return AdvSimd.Arm64.InsertSelectedScalar(vector.AsUInt64(), 1, value.ToVector128Unsafe().AsUInt64(), 0).As<ulong, T>();
}

return SoftwareFallback(vector, value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1552,6 +1552,13 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
goto NotEqual;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
// // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
// // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
//}
else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
{
nuint offset = 0;
Expand Down Expand Up @@ -1787,6 +1794,15 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
return result;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. There were some wins if the mismatch happen
// // after 8th index of the chunk because with ARM64 intrinsic, using fewer instructions the first mismatched
// // index can be retrieved. In case of vectorization, sequential scan has to be done instead. However, at the
// // same time, there are losses if the mismatch index is less than 7~8. So the overall benefit doesn't justify
// // to optimize this method with ARM64 hardware intrinsics.
//}
else if (Vector.IsHardwareAccelerated)
{
if (lengthToExamine > (nuint)Vector<byte>.Count)
Expand Down

0 comments on commit bf8aba0

Please sign in to comment.