Skip to content

Commit

Permalink
Optimize WithUpper/WithLower with InsertSelectedScalar, SpanHelpers.Sequence APIs (dotnet#38075)

Browse files Browse the repository at this point in the history

* Add a note for SpanHelpers.SequenceCompareTo() and SequenceEqual() method

* Use InsertSelectedScalar for WithUpper() and WithLower()
  • Loading branch information
kunalspathak authored Jun 24, 2020
1 parent 313b165 commit bf8aba0
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1682,12 +1682,9 @@ public static Vector64<T> GetLower<T>(this Vector128<T> vector)
public static Vector128<T> WithLower<T>(this Vector128<T> vector, Vector64<T> value)
where T : struct
{
if (AdvSimd.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
// Note: The 3rd operand GetElement() should be the argument to Insert(). Storing the
// result of GetElement() in a local variable and then passing local variable to Insert()
// would not merge insert/getelement in a single instruction.
return AdvSimd.Insert(vector.AsUInt64(), 0, value.AsUInt64().GetElement(0)).As<ulong, T>();
return AdvSimd.Arm64.InsertSelectedScalar(vector.AsUInt64(), 0, value.ToVector128Unsafe().AsUInt64(), 0).As<ulong, T>();
}

return SoftwareFallback(vector, value);
Expand Down Expand Up @@ -1727,12 +1724,9 @@ public static Vector64<T> GetUpper<T>(this Vector128<T> vector)
public static Vector128<T> WithUpper<T>(this Vector128<T> vector, Vector64<T> value)
where T : struct
{
if (AdvSimd.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
// Note: The 3rd operand GetElement() should be the argument to Insert(). Storing the
// result of GetElement() in a local variable and then passing local variable to Insert()
// would not merge insert/getelement in a single instruction.
return AdvSimd.Insert(vector.AsUInt64(), 1, value.AsUInt64().GetElement(0)).As<ulong, T>();
return AdvSimd.Arm64.InsertSelectedScalar(vector.AsUInt64(), 1, value.ToVector128Unsafe().AsUInt64(), 0).As<ulong, T>();
}

return SoftwareFallback(vector, value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1552,6 +1552,13 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
goto NotEqual;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
// // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
// // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
//}
else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
{
nuint offset = 0;
Expand Down Expand Up @@ -1787,6 +1794,15 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
return result;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. There were some wins if the mismatch happen
// // after 8th index of the chunk because with ARM64 intrinsic, using fewer instructions the first mismatched
// // index can be retrieved. In case of vectorization, sequential scan has to be done instead. However, at the
// // same time, there are losses if the mismatch index is less than 7~8. So the overall benefit doesn't justify
// // to optimize this method with ARM64 hardware intrinsics.
//}
else if (Vector.IsHardwareAccelerated)
{
if (lengthToExamine > (nuint)Vector<byte>.Count)
Expand Down

0 comments on commit bf8aba0

Please sign in to comment.