Skip to content

Commit

Permalink
[arm64] Accelerate HexConverter::EncodeToUtf16 (dotnet#67192)
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorBo authored Apr 16, 2022
1 parent 2d4f2d0 commit 2c3e1c7
Showing 1 changed file with 52 additions and 24 deletions.
76 changes: 52 additions & 24 deletions src/libraries/Common/src/System/HexConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#if SYSTEM_PRIVATE_CORELIB
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
#endif

Expand Down Expand Up @@ -89,11 +90,8 @@ public static void ToCharsBuffer(byte value, Span<char> buffer, int startingInde
}

#if SYSTEM_PRIVATE_CORELIB
private static void EncodeToUtf16_Ssse3(ReadOnlySpan<byte> bytes, Span<char> chars, Casing casing)
private static void EncodeToUtf16_Vector128(ReadOnlySpan<byte> bytes, Span<char> chars, Casing casing)
{
Debug.Assert(bytes.Length >= 4);
nint pos = 0;

Vector128<byte> shuffleMask = Vector128.Create(
0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 1, 0xFF,
0xFF, 0xFF, 2, 0xFF, 0xFF, 0xFF, 3, 0xFF);
Expand All @@ -108,40 +106,70 @@ private static void EncodeToUtf16_Ssse3(ReadOnlySpan<byte> bytes, Span<char> cha
(byte)'8', (byte)'9', (byte)'a', (byte)'b',
(byte)'c', (byte)'d', (byte)'e', (byte)'f');

nuint pos = 0;
Debug.Assert(bytes.Length >= 4);

// lengthSubVector128 ensures the trailing elements can be processed in the same SIMD loop (with possible overlap).
// Lengths that are evenly divisible by 4 are never computed twice, because we
// compare pos > lengthSubVector128 rather than pos >= lengthSubVector128
nuint lengthSubVector128 = (nuint)bytes.Length - (nuint)Vector128<int>.Count;
ref byte destRef = ref Unsafe.As<char, byte>(ref MemoryMarshal.GetReference(chars));
do
{
// Read 32bits from "bytes" span at "pos" offset
uint block = Unsafe.ReadUnaligned<uint>(
ref Unsafe.Add(ref MemoryMarshal.GetReference(bytes), pos));

// Local helper that dispatches the byte shuffle / table lookup to whichever of the
// two instruction sets (SSSE3 or Arm64 AdvSimd) reports itself as supported.
// TODO: Delete this helper once the cross-platform Shuffle API has landed
// https://github.com/dotnet/runtime/issues/63331
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector128<byte> Shuffle(Vector128<byte> value, Vector128<byte> mask)
{
    if (!Ssse3.IsSupported)
    {
        // Without SSSE3 the only remaining implementation is Arm64 AdvSimd;
        // any other platform is rejected.
        if (!AdvSimd.Arm64.IsSupported)
        {
            ThrowHelper.ThrowNotSupportedException();
        }

        return AdvSimd.Arm64.VectorTableLookup(value, mask);
    }

    return Ssse3.Shuffle(value, mask);
}

// Calculate nibbles
Vector128<byte> lowNibbles = Ssse3.Shuffle(
Vector128<byte> lowNibbles = Shuffle(
Vector128.CreateScalarUnsafe(block).AsByte(), shuffleMask);
Vector128<byte> highNibbles = Sse2.ShiftRightLogical(
Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte();

// ExtractVector128 is not entirely the same as ShiftRightLogical128BitLane, but it works here since
// first two bytes in lowNibbles are guaranteed to be zeros
Vector128<byte> shifted = Sse2.IsSupported ?
Sse2.ShiftRightLogical128BitLane(lowNibbles, 2) :
AdvSimd.ExtractVector128(lowNibbles, lowNibbles, 2);

Vector128<byte> highNibbles = Vector128.ShiftRightLogical(shifted.AsInt32(), 4).AsByte();

// Lookup the hex values at the positions of the indices
Vector128<byte> indices = Sse2.And(
Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF));
Vector128<byte> hex = Ssse3.Shuffle(asciiTable, indices);
Vector128<byte> indices = (lowNibbles | highNibbles) & Vector128.Create((byte)0xF);
Vector128<byte> hex = Shuffle(asciiTable, indices);

// The high bytes (0x00) of the chars have also been converted
// to ascii hex '0', so clear them out.
hex = Sse2.And(hex, Vector128.Create((ushort)0xFF).AsByte());
hex &= Vector128.Create((ushort)0xFF).AsByte();
hex.StoreUnsafe(ref destRef, pos * 4); // we encode 4 bytes as a single char (0x0-0xF)
pos += (nuint)Vector128<int>.Count;

// Save to "chars" at pos*2 offset
Unsafe.WriteUnaligned(
ref Unsafe.As<char, byte>(
ref Unsafe.Add(ref MemoryMarshal.GetReference(chars), pos * 2)), hex);
if (pos == (nuint)bytes.Length)
{
return;
}

pos += 4;
} while (pos < bytes.Length - 3);
// Overlap with the current chunk for trailing elements
if (pos > lengthSubVector128)
{
pos = lengthSubVector128;
}

// Process trailing elements (bytes.Length % 4)
for (; pos < bytes.Length; pos++)
{
ToCharsBuffer(Unsafe.Add(ref MemoryMarshal.GetReference(bytes), pos), chars, (int)pos * 2, casing);
}
} while (true);
}
#endif

Expand All @@ -150,9 +178,9 @@ public static void EncodeToUtf16(ReadOnlySpan<byte> bytes, Span<char> chars, Cas
Debug.Assert(chars.Length >= bytes.Length * 2);

#if SYSTEM_PRIVATE_CORELIB
if (Ssse3.IsSupported && bytes.Length >= 4)
if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported) && bytes.Length >= 4)
{
EncodeToUtf16_Ssse3(bytes, chars, casing);
EncodeToUtf16_Vector128(bytes, chars, casing);
return;
}
#endif
Expand Down

0 comments on commit 2c3e1c7

Please sign in to comment.