From b79bc19d99e71227be6d65a3facdfc65b2e6cc1e Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Wed, 18 Sep 2019 10:29:00 +0200
Subject: [PATCH] Philox SIMD: Put wrappers for missing intrinsics into a
 separate file

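The _my_cvtepu32_ps and _my_cvtepu64_pd wrappers were duplicated verbatim in
aesni_rand.h and philox_rand.h, and philox_rand.h additionally carried the
_my256_* AVX2 variants. Move them all into a new header, myintrin.h, include
it from both files, guard each group with the matching instruction-set macro
(__SSE2__, __SSE4_1__, __AVX2__), and add a _MY_TRANSPOSE4_EPI32 helper that
transposes four __m128i registers of 32-bit lanes.

Note that myintrin.h does not define QUALIFIERS itself; it relies on the
including header (aesni_rand.h or philox_rand.h) having defined it already.
For illustration only, a minimal stand-alone host-side sketch of that include
pattern (the QUALIFIERS value below is an assumption; the RNG headers define
it themselves):

    // Hypothetical example, not part of this patch.
    #include <immintrin.h>

    #define QUALIFIERS inline   // assumption for a host-only translation unit
    #include "myintrin.h"

    int main()
    {
        __m128i v = _mm_set_epi32(4, 3, 2, 1);   // lanes 0..3 hold 1, 2, 3, 4
        __m128  f = _my_cvtepu32_ps(v);          // unsigned lanes -> {1.0f, 2.0f, 3.0f, 4.0f}
        (void)f;
        return 0;
    }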
---
 pystencils/include/aesni_rand.h  | 31 +------------
 pystencils/include/myintrin.h    | 79 ++++++++++++++++++++++++++++++++
 pystencils/include/philox_rand.h | 60 +-----------------------
 3 files changed, 83 insertions(+), 87 deletions(-)
 create mode 100644 pystencils/include/myintrin.h

diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 1e2b05bd2..c8b4089f8 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -18,6 +18,8 @@
 #define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
 #define TWOPOW32_INV_FLOAT (2.3283064e-10f)
 
+#include "myintrin.h"
+
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 
@@ -36,35 +38,6 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
     return x;
 }
 
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
 
 QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1, uint32 key2, uint32 key3,
diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h
new file mode 100644
index 000000000..38304cb5f
--- /dev/null
+++ b/pystencils/include/myintrin.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#ifdef __SSE2__
+QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu32_ps(v);
+#else
+    __m128i v2 = _mm_srli_epi32(v, 1);
+    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
+    __m128 v2f = _mm_cvtepi32_ps(v2);
+    __m128 v1f = _mm_cvtepi32_ps(v1);
+    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
+{
+    __m128i T0, T1, T2, T3;
+    T0  = _mm_unpacklo_epi32(R0, R1);
+    T1  = _mm_unpacklo_epi32(R2, R3);
+    T2  = _mm_unpackhi_epi32(R0, R1);
+    T3  = _mm_unpackhi_epi32(R2, R3);
+    R0  = _mm_unpacklo_epi64(T0, T1);
+    R1  = _mm_unpackhi_epi64(T0, T1);
+    R2  = _mm_unpacklo_epi64(T2, T3);
+    R3  = _mm_unpackhi_epi64(T2, T3);
+}
+#endif
+
+#ifdef __SSE4_1__
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu64_pd(x);
+#else
+    __m128i xH = _mm_srli_epi64(x, 32);
+    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
+    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
+    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
+    return _mm_add_pd(f, _mm_castsi128_pd(xL));
+#endif
+}
+#endif
+
+#ifdef __AVX2__
+QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu32_ps(v);
+#else
+    __m256i v2 = _mm256_srli_epi32(v, 1);
+    __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
+    __m256 v2f = _mm256_cvtepi32_ps(v2);
+    __m256 v1f = _mm256_cvtepi32_ps(v1);
+    return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu64_pd(x);
+#else
+    __m256i xH = _mm256_srli_epi64(x, 32);
+    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.)));          //  2^84
+    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
+    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
+    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+#endif
+}
+#endif
+
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 423dbfab2..642cc9aa5 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -18,6 +18,8 @@
 #define QUALIFIERS static __forceinline__ __device__
 #endif
 
+#include "myintrin.h"
+
 #define PHILOX_W32_0   (0x9E3779B9)
 #define PHILOX_W32_1   (0xBB67AE85)
 #define PHILOX_M4x32_0 (0xD2511F53)
@@ -144,35 +146,6 @@ QUALIFIERS void _philox4x32bumpkey(__m128i* key)
     key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
 }
 
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
 template<bool high>
 QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
 {
@@ -301,35 +274,6 @@ QUALIFIERS void _philox4x32bumpkey(__m256i* key)
     key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
 }
 
-QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
-{
-#ifdef __AVX512VL__
-    return _mm256_cvtepu32_ps(v);
-#else
-    __m256i v2 = _mm256_srli_epi32(v, 1);
-    __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
-    __m256 v2f = _mm256_cvtepi32_ps(v2);
-    __m256 v1f = _mm256_cvtepi32_ps(v1);
-    return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
-{
-#ifdef __AVX512VL__
-    return _mm256_cvtepu64_pd(x);
-#else
-    __m256i xH = _mm256_srli_epi64(x, 32);
-    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.)));          //  2^84
-    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
-    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
-    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
-#endif
-}
-
 template<bool high>
 QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
 {
-- 
GitLab