-
I apologize, this is lengthy and probably over-explained. In the process of writing my own binary endianness swapper, I used the vector shuffle APIs (`Vector128.Shuffle` / `Vector256.Shuffle`), as shown in the code below.
Disclaimer: this is not an argument over whether what I'm doing is sane, or whether I should even bother doing it. It's an exploration of what I can do with the recent additions to the language, as I haven't seriously touched .NET since .NET Core 3.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
/// <summary>
/// Reads <paramref name="count"/> big-endian integers of type <typeparamref name="T"/> from the stream
/// and returns them in the byte order of the current machine.
/// </summary>
/// <typeparam name="T">Unmanaged binary integer type of each element.</typeparam>
/// <param name="stream">Stream to read from; exactly <c>count * sizeof(T)</c> bytes are consumed.</param>
/// <param name="count">Number of elements to read.</param>
/// <returns>A newly allocated array holding the decoded elements.</returns>
public static unsafe T[] ReadBE<T>(this Stream stream, int count) where T : unmanaged, IBinaryInteger<T>
{
    T[] result = new T[count];
    Span<T> elements = result;

    // Fill the array in place by viewing its storage as raw bytes.
    stream.ReadExactly(MemoryMarshal.AsBytes(elements));

    // The bytes are already in native order on a big-endian machine.
    if (!BitConverter.IsLittleEndian)
    {
        return result;
    }

    ReverseEndianness(elements);
    return result;
}
/// <summary>
/// Reads <paramref name="count"/> big-endian 32-bit unsigned integers from the stream.
/// Convenience wrapper over the generic <c>ReadBE</c> with <c>uint</c> as the element type.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe uint[] ReadUInt32BE(this Stream stream, int count)
{
    return ReadBE<uint>(stream, count);
}
/// <summary>
/// Reverses, in place, the byte order of every element of <paramref name="value"/>.
/// </summary>
/// <remarks>
/// Elements are processed in 256-bit chunks, then 128-bit chunks, using a byte shuffle,
/// and whatever is left is byte-swapped one element at a time. A vector path is only
/// taken when that vector width is hardware accelerated and <typeparamref name="T"/> is a
/// supported vector element type; otherwise the scalar loop handles everything.
/// </remarks>
/// <typeparam name="T">Unmanaged binary integer type of each element.</typeparam>
/// <param name="value">Elements whose bytes are reversed; modified in place.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static unsafe void ReverseEndianness<T>(Span<T> value) where T : unmanaged, IBinaryInteger<T>
{
    var valueSpan = MemoryMarshal.AsBytes(value);
    ref T sourceRef = ref MemoryMarshal.GetReference(value);
    int i = 0;

    // IsSupported keeps Vector256<T>.Count / Vector128<T>.Count from throwing for
    // element types that have no vector representation.
    if (Vector256.IsHardwareAccelerated && Vector256<T>.IsSupported)
    {
        // Last element index at which a full 256-bit vector still fits inside the span.
        // This must be computed from Vector256<T>.Count: using the 128-bit count here
        // lets the final load/store run past the end of the buffer.
        int lastStart256 = value.Length - Vector256<T>.Count;
        while (i <= lastStart256)
        {
            Vector256.StoreUnsafe(Vector256.Shuffle(
                Vector256.LoadUnsafe(ref sourceRef, (uint)i).AsByte(),
                MakeSwizzle256Fast<T>()
            ).As<byte, T>(), ref sourceRef, (uint)i);
            i += Vector256<T>.Count;
        }
    }

    if (Vector128.IsHardwareAccelerated && Vector128<T>.IsSupported)
    {
        // Last element index at which a full 128-bit vector still fits inside the span.
        int lastStart128 = value.Length - Vector128<T>.Count;
        while (i <= lastStart128)
        {
            Vector128.StoreUnsafe(Vector128.Shuffle(
                Vector128.LoadUnsafe(ref sourceRef, (uint)i).AsByte(),
                MakeSwizzle128Fast<T>()
            ).As<byte, T>(), ref sourceRef, (uint)i);
            i += Vector128<T>.Count;
        }
    }

    // Scalar tail: convert the element index into a byte offset and reverse the
    // bytes of each remaining element individually.
    int elementSize = Unsafe.SizeOf<T>();
    for (int offset = i * elementSize; offset < valueSpan.Length; offset += elementSize)
    {
        valueSpan.Slice(offset, elementSize).Reverse();
    }
}
/// <summary>
/// Overwrites the bytes of <paramref name="zero"/> with a shuffle mask that reverses the bytes
/// inside each consecutive group of <paramref name="wordSize"/> bytes.
/// </summary>
/// <typeparam name="M">Struct type whose raw bytes receive the mask.</typeparam>
/// <param name="zero">Destination; every byte is overwritten when <paramref name="wordSize"/> divides its size.</param>
/// <param name="wordSize">Size in bytes of each group to reverse.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining), SkipLocalsInit]
private unsafe static void MakeSwizzle<M>(ref M zero, int wordSize) where M : struct
{
    // Raw pointer access avoids bounds checks on every byte store.
    byte* mask = (byte*) Unsafe.AsPointer(ref zero);
    int totalBytes = Unsafe.SizeOf<M>();

    for (int wordStart = 0; wordStart < totalBytes; wordStart += wordSize)
    {
        // Index of the last byte of this word; slot p of the word selects the byte mirrored around the word's centre.
        int wordLast = wordStart + wordSize - 1;
        for (int p = 0; p < wordSize; ++p)
        {
            mask[wordStart + p] = (byte)(wordLast - p);
        }
    }
}
/// <summary>
/// Returns the 32-byte shuffle mask that reverses the bytes of every <typeparamref name="T"/>-sized
/// element in a 256-bit vector.
/// </summary>
/// <remarks>
/// 4- and 8-byte element sizes return literal masks; every other size builds the mask
/// through <c>MakeSwizzle</c>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> MakeSwizzle256Fast<T>() where T : unmanaged, IBinaryInteger<T>
{
    switch (Unsafe.SizeOf<T>())
    {
        case 4:
            return Vector256.Create((byte)3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28);

        case 8:
            return Vector256.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24);

        default:
            // 16-bit elements might be faster as a pair of shifts, but shuffling is what is being tried here.
            Unsafe.SkipInit(out Vector256<byte> mask);
            MakeSwizzle(ref mask, Unsafe.SizeOf<T>());
            return mask;
    }
}
/// <summary>
/// Returns the 16-byte shuffle mask that reverses the bytes of every <typeparamref name="T"/>-sized
/// element in a 128-bit vector.
/// </summary>
/// <remarks>
/// 4- and 8-byte element sizes return literal masks; every other size builds the mask
/// through <c>MakeSwizzle</c>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> MakeSwizzle128Fast<T>() where T : unmanaged, IBinaryInteger<T>
{
    switch (Unsafe.SizeOf<T>())
    {
        case 4:
            return Vector128.Create((byte)3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);

        case 8:
            return Vector128.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

        default:
            Unsafe.SkipInit(out Vector128<byte> mask);
            MakeSwizzle(ref mask, Unsafe.SizeOf<T>());
            return mask;
    }
} In the process of writing this and benchmarking it I noticed the following things:
Where the benchmark is: [Benchmark(Description = "Endian swap 21 32-bits numbers")]
/// <summary>
/// Benchmark body: rewinds the data stream and decodes 21 big-endian 32-bit integers
/// through the custom <c>ReadUInt32BE</c> extension.
/// </summary>
public uint[] BenchmarkEndianness()
{
    // Rewind so every invocation reads from the start of the stream.
    _dataStream.Position = 0;
    return _dataStream.ReadUInt32BE(21);
}
/// <summary>
/// Baseline benchmark body: rewinds the data stream, reads 21 32-bit integers as raw bytes and
/// swaps them in place with <c>BinaryPrimitives.ReverseEndianness</c>.
/// </summary>
[Benchmark(Description = "Endian swap 21 32-bits numbers - Builtin", Baseline = true)]
public uint[] BenchmarkEndiannessBuiltin()
{
    // Rewind so every invocation reads from the start of the stream.
    _dataStream.Position = 0;

    uint[] buffer = new uint[21];
    Span<uint> view = buffer.AsSpan();
    _dataStream.ReadExactly(MemoryMarshal.AsBytes(view));

    // Source and destination are the same span, so the swap happens in place.
    BinaryPrimitives.ReverseEndianness(view, view);
    return buffer;
} And an excerpt of the disassembly:
Meanwhile
So my question is: what am I missing? What's causing the JIT to drop the intrinsic and fall back to a software implementation? I found #102702 and the associated pull requests, but I don't know enough about the JIT's rules on using intrinsics to really understand what's going on, beyond the fact that the JIT gives up trying to use an intrinsic and falls back to calling the software implementation.
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 4 replies
-
This a known limitation of |
Beta Was this translation helpful? Give feedback.
This is a known limitation of the `Vector_.Shuffle` API. It currently expects a constant mask to be passed directly as a parameter, and that constant must be visible to the JIT early, so hiding the mask behind an aggressively inlined method might not work as expected. Eventually, this should be improved for cases like this one.