Support storing Vector4 SoA/AoS conversion

rafzi · Apr 30, 2024 · b97dab0 · b97dab0
1 parent 061af10
commit b97dab0
Show file tree

Hide file tree

Showing 3 changed files with 291 additions and 26 deletions.
diff --git a/src/libs/vcl.core/vcl/core/simd/memory_avx.h b/src/libs/vcl.core/vcl/core/simd/memory_avx.h
@@ -307,6 +307,45 @@ namespace Vcl {
 		_mm512_storeu_ps(p + 16, m4567);
 		_mm512_storeu_ps(p + 32, m89ab);
 	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const __m512& x,
+		const __m512& y,
+		const __m512& z,
+		const __m512& w)
+	{
+		const __m512 x0x2y0y2 = _mm512_shuffle_ps(x, y, _MM_SHUFFLE(2, 0, 2, 0)); // 02'46'8a'ce
+		const __m512 x1x3y1y3 = _mm512_shuffle_ps(x, y, _MM_SHUFFLE(3, 1, 3, 1)); // 13'57'9b'df
+		const __m512 z0z2w0w2 = _mm512_shuffle_ps(z, w, _MM_SHUFFLE(2, 0, 2, 0));
+		const __m512 z1z3w1w3 = _mm512_shuffle_ps(z, w, _MM_SHUFFLE(3, 1, 3, 1));
+
+		const __m512 x0y0x1y1 = _mm512_shuffle_ps(x0x2y0y2, x1x3y1y3, _MM_SHUFFLE(2, 0, 2, 0));
+		const __m512 x2y2x3y3 = _mm512_shuffle_ps(x0x2y0y2, x1x3y1y3, _MM_SHUFFLE(3, 1, 3, 1));
+		const __m512 z0w0z1w1 = _mm512_shuffle_ps(z0z2w0w2, z1z3w1w3, _MM_SHUFFLE(2, 0, 2, 0));
+		const __m512 z2w2z3w3 = _mm512_shuffle_ps(z0z2w0w2, z1z3w1w3, _MM_SHUFFLE(3, 1, 3, 1));
+
+		const __m512 m048c = _mm512_shuffle_ps(x0y0x1y1, z0w0z1w1, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m512 m159d = _mm512_shuffle_ps(x0y0x1y1, z0w0z1w1, _MM_SHUFFLE(3, 2, 3, 2));
+		const __m512 m26ae = _mm512_shuffle_ps(x2y2x3y3, z2w2z3w3, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m512 m37bf = _mm512_shuffle_ps(x2y2x3y3, z2w2z3w3, _MM_SHUFFLE(3, 2, 3, 2));
+
+		const __m512 m0415 = _mm512_shuffle_f32x4(m048c, m159d, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m512 m2637 = _mm512_shuffle_f32x4(m26ae, m37bf, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m512 m8c9d = _mm512_shuffle_f32x4(m048c, m159d, _MM_SHUFFLE(3, 2, 3, 2));
+		const __m512 maebf = _mm512_shuffle_f32x4(m26ae, m37bf, _MM_SHUFFLE(3, 2, 3, 2));
+
+		const __m512 m0123 = _mm512_shuffle_f32x4(m0415, m2637, _MM_SHUFFLE(2, 0, 2, 0));
+		const __m512 m4567 = _mm512_shuffle_f32x4(m0415, m2637, _MM_SHUFFLE(3, 1, 3, 1));
+		const __m512 m89ab = _mm512_shuffle_f32x4(m8c9d, maebf, _MM_SHUFFLE(2, 0, 2, 0));
+		const __m512 mcdef = _mm512_shuffle_f32x4(m8c9d, maebf, _MM_SHUFFLE(3, 1, 3, 1));
+
+		float* p = base->data();
+		_mm512_storeu_ps(p + 0, m0123);
+		_mm512_storeu_ps(p + 16, m4567);
+		_mm512_storeu_ps(p + 32, m89ab);
+		_mm512_storeu_ps(p + 48, mcdef);
+	}
 #	endif
 
 	VCL_STRONG_INLINE void load(
@@ -427,6 +466,41 @@ namespace Vcl {
 		_mm_storeu_ps(p + 20, _mm256_extractf128_ps(r25, 1));
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const __m256& x,
+		const __m256& y,
+		const __m256& z,
+		const __m256& w)
+	{
+		const __m256 x0y0x1y1x4y4x5y5 = _mm256_unpacklo_ps(x, y);
+		const __m256 x2y2x3y3x6y6x7y7 = _mm256_unpackhi_ps(x, y);
+		const __m256 z0w0z1w1z4w4z5w5 = _mm256_unpacklo_ps(z, w);
+		const __m256 z2w2z3w3z6w6z7w7 = _mm256_unpackhi_ps(z, w);
+
+		const __m256 x0y0x2y2x4y4x6y6 = _mm256_shuffle_ps(x0y0x1y1x4y4x5y5, x2y2x3y3x6y6x7y7, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m256 x1y1x3y3x5y5x7y7 = _mm256_shuffle_ps(x0y0x1y1x4y4x5y5, x2y2x3y3x6y6x7y7, _MM_SHUFFLE(3, 2, 3, 2));
+
+		const __m256 z0w0z2w2z4w4z6w6 = _mm256_shuffle_ps(z0w0z1w1z4w4z5w5, z2w2z3w3z6w6z7w7, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m256 z1w1z3w3z5w5z7w7 = _mm256_shuffle_ps(z0w0z1w1z4w4z5w5, z2w2z3w3z6w6z7w7, _MM_SHUFFLE(3, 2, 3, 2));
+
+		const __m256 r04 = _mm256_shuffle_ps(x0y0x2y2x4y4x6y6, z0w0z2w2z4w4z6w6, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m256 r26 = _mm256_shuffle_ps(x0y0x2y2x4y4x6y6, z0w0z2w2z4w4z6w6, _MM_SHUFFLE(3, 2, 3, 2));
+
+		const __m256 r15 = _mm256_shuffle_ps(x1y1x3y3x5y5x7y7, z1w1z3w3z5w5z7w7, _MM_SHUFFLE(1, 0, 1, 0));
+		const __m256 r37 = _mm256_shuffle_ps(x1y1x3y3x5y5x7y7, z1w1z3w3z5w5z7w7, _MM_SHUFFLE(3, 2, 3, 2));
+
+		float* p = base->data();
+		_mm_storeu_ps(p + 0, _mm256_castps256_ps128(r04));
+		_mm_storeu_ps(p + 4, _mm256_castps256_ps128(r15));
+		_mm_storeu_ps(p + 8, _mm256_castps256_ps128(r26));
+		_mm_storeu_ps(p + 12, _mm256_castps256_ps128(r37));
+		_mm_storeu_ps(p + 16, _mm256_extractf128_ps(r04, 1));
+		_mm_storeu_ps(p + 20, _mm256_extractf128_ps(r15, 1));
+		_mm_storeu_ps(p + 24, _mm256_extractf128_ps(r26, 1));
+		_mm_storeu_ps(p + 28, _mm256_extractf128_ps(r37, 1));
+	}
+
 	VCL_STRONG_INLINE void load(
 		Eigen::Matrix<float8, 2, 1>& loaded,
 		const Eigen::Vector2f* base)
@@ -553,6 +627,30 @@ namespace Vcl {
 			_mm256_castsi256_ps(value(2).get(0)));
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float8, 4, 1>& value)
+	{
+		store(
+			base,
+			value(0).get(0),
+			value(1).get(0),
+			value(2).get(0),
+			value(3).get(0));
+	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int8, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base),
+			_mm256_castsi256_ps(value(0).get(0)),
+			_mm256_castsi256_ps(value(1).get(0)),
+			_mm256_castsi256_ps(value(2).get(0)),
+			_mm256_castsi256_ps(value(3).get(0)));
+	}
+
 #	ifdef VCL_VECTORIZE_AVX512
 	VCL_STRONG_INLINE void load(
 		Eigen::Matrix<float16, 2, 1>& loaded,
@@ -637,6 +735,30 @@ namespace Vcl {
 			_mm512_castsi512_ps(value(1).get(0)),
 			_mm512_castsi512_ps(value(2).get(0)));
 	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float16, 4, 1>& value)
+	{
+		store(
+			base,
+			value(0).get(0),
+			value(1).get(0),
+			value(2).get(0),
+			value(3).get(0));
+	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int16, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base),
+			_mm512_castsi512_ps(value(0).get(0)),
+			_mm512_castsi512_ps(value(1).get(0)),
+			_mm512_castsi512_ps(value(2).get(0)),
+			_mm512_castsi512_ps(value(3).get(0)));
+	}
 #	else
 	VCL_STRONG_INLINE void load(
 		Eigen::Matrix<float16, 2, 1>& loaded,
@@ -775,6 +897,33 @@ namespace Vcl {
 			_mm256_castsi256_ps(value(1).get(1)),
 			_mm256_castsi256_ps(value(2).get(1)));
 	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float16, 4, 1>& value)
+	{
+		store(base + 0, value(0).get(0), value(1).get(0), value(2).get(0), value(3).get(0));
+		store(base + 8, value(0).get(1), value(1).get(1), value(2).get(1), value(3).get(1));
+	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int16, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base),
+			_mm256_castsi256_ps(value(0).get(0)),
+			_mm256_castsi256_ps(value(1).get(0)),
+			_mm256_castsi256_ps(value(2).get(0)),
+			_mm256_castsi256_ps(value(3).get(0)));
+
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 8,
+			_mm256_castsi256_ps(value(0).get(1)),
+			_mm256_castsi256_ps(value(1).get(1)),
+			_mm256_castsi256_ps(value(2).get(1)),
+			_mm256_castsi256_ps(value(3).get(1)));
+	}
 #	endif
 
 	VCL_STRONG_INLINE std::array<float8, 2> interleave(const float8& a, const float8& b) noexcept

diff --git a/src/libs/vcl.core/vcl/core/simd/memory_sse.h b/src/libs/vcl.core/vcl/core/simd/memory_sse.h
@@ -176,6 +176,30 @@ namespace Vcl {
 		_mm_storeu_ps(p + 8, rz2x3y3z3);
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const __m128& x,
+		const __m128& y,
+		const __m128& z,
+		const __m128& w) noexcept
+	{
+		const __m128 x0y0x1y1 = _mm_unpacklo_ps(x, y);
+		const __m128 x2y2x3y3 = _mm_unpackhi_ps(x, y);
+		const __m128 z0w0z1w1 = _mm_unpacklo_ps(z, w);
+		const __m128 z2w2z3w3 = _mm_unpackhi_ps(z, w);
+
+		const __m128 x0y0z0w0 = _mm_movelh_ps(x0y0x1y1, z0w0z1w1);
+		const __m128 x1y1z1w1 = _mm_movehl_ps(z0w0z1w1, x0y0x1y1);
+		const __m128 x2y2z2w2 = _mm_movelh_ps(x2y2x3y3, z2w2z3w3);
+		const __m128 x3y3z3w3 = _mm_movehl_ps(z2w2z3w3, x2y2x3y3);
+
+		float* p = base->data();
+		_mm_storeu_ps(p + 0, x0y0z0w0);
+		_mm_storeu_ps(p + 4, x1y1z1w1);
+		_mm_storeu_ps(p + 8, x2y2z2w2);
+		_mm_storeu_ps(p + 12, x3y3z3w3);
+	}
+
 	VCL_STRONG_INLINE void load(
 		Eigen::Matrix<float4, 2, 1>& loaded,
 		const Eigen::Vector2f* base)
@@ -287,6 +311,29 @@ namespace Vcl {
 			_mm_castsi128_ps(value(2).get(0)));
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float4, 4, 1>& value)
+	{
+		store(
+			base,
+			value(0).get(0),
+			value(1).get(0),
+			value(2).get(0),
+			value(3).get(0));
+	}
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int4, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base),
+			_mm_castsi128_ps(value(0).get(0)),
+			_mm_castsi128_ps(value(1).get(0)),
+			_mm_castsi128_ps(value(2).get(0)),
+			_mm_castsi128_ps(value(3).get(0)));
+	}
+
 	VCL_STRONG_INLINE std::array<float4, 2> interleave(const float4& a, const float4& b) noexcept
 	{
 		float4 low{ _mm_unpacklo_ps(a.get(0), b.get(0)) };
@@ -592,6 +639,33 @@ namespace Vcl {
 			_mm_castsi128_ps(value(2).get(1)));
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float8, 4, 1>& value)
+	{
+		store(base + 0, value(0).get(0), value(1).get(0), value(2).get(0), value(3).get(0));
+		store(base + 4, value(0).get(1), value(1).get(1), value(2).get(1), value(3).get(1));
+	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int8, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 0,
+			_mm_castsi128_ps(value(0).get(0)),
+			_mm_castsi128_ps(value(1).get(0)),
+			_mm_castsi128_ps(value(2).get(0)),
+			_mm_castsi128_ps(value(3).get(0)));
+
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 4,
+			_mm_castsi128_ps(value(0).get(1)),
+			_mm_castsi128_ps(value(1).get(1)),
+			_mm_castsi128_ps(value(2).get(1)),
+			_mm_castsi128_ps(value(3).get(1)));
+	}
+
 	VCL_STRONG_INLINE void store(
 		Eigen::Vector2f* base,
 		const Eigen::Matrix<float16, 2, 1>& value)
@@ -664,6 +738,48 @@ namespace Vcl {
 			_mm_castsi128_ps(value(2).get(3)));
 	}
 
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4f* base,
+		const Eigen::Matrix<float16, 4, 1>& value)
+	{
+		store(base + 0, value(0).get(0), value(1).get(0), value(2).get(0), value(3).get(0));
+		store(base + 4, value(0).get(1), value(1).get(1), value(2).get(1), value(3).get(1));
+		store(base + 8, value(0).get(2), value(1).get(2), value(2).get(2), value(3).get(2));
+		store(base + 12, value(0).get(3), value(1).get(3), value(2).get(3), value(3).get(3));
+	}
+
+	VCL_STRONG_INLINE void store(
+		Eigen::Vector4i* base,
+		const Eigen::Matrix<int16, 4, 1>& value)
+	{
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 0,
+			_mm_castsi128_ps(value(0).get(0)),
+			_mm_castsi128_ps(value(1).get(0)),
+			_mm_castsi128_ps(value(2).get(0)),
+			_mm_castsi128_ps(value(3).get(0)));
+
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 4,
+			_mm_castsi128_ps(value(0).get(1)),
+			_mm_castsi128_ps(value(1).get(1)),
+			_mm_castsi128_ps(value(2).get(1)),
+			_mm_castsi128_ps(value(3).get(1)));
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 8,
+			_mm_castsi128_ps(value(0).get(2)),
+			_mm_castsi128_ps(value(1).get(2)),
+			_mm_castsi128_ps(value(2).get(2)),
+			_mm_castsi128_ps(value(3).get(2)));
+
+		store(
+			reinterpret_cast<Eigen::Vector4f*>(base) + 12,
+			_mm_castsi128_ps(value(0).get(3)),
+			_mm_castsi128_ps(value(1).get(3)),
+			_mm_castsi128_ps(value(2).get(3)),
+			_mm_castsi128_ps(value(3).get(3)));
+	}
+
 	VCL_STRONG_INLINE std::array<float8, 2> interleave(const float8& a, const float8& b)
 	{
 		const float8 low{ _mm_unpacklo_ps(a.get(0), b.get(0)), _mm_unpackhi_ps(a.get(0), b.get(0)) };