diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 1e2b05bd28a01d64dcbd709d4018dcb7adfd69cd..c8b4089f86fb08c7740f72f17d288b707cff6a1c 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -18,6 +18,8 @@
 #define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
 #define TWOPOW32_INV_FLOAT (2.3283064e-10f)
 
+#include "myintrin.h"
+
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 
@@ -36,35 +38,6 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
     return x;
 }
 
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          // 2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   // 2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     // 2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
 QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1, uint32 key2, uint32 key3,
                               double & rnd1, double & rnd2)
diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h
new file mode 100644
index 0000000000000000000000000000000000000000..38304cb5f4c9cead701e4e902aef05862754d237
--- /dev/null
+++ b/pystencils/include/myintrin.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#ifdef __SSE2__
+QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu32_ps(v);
+#else
+    __m128i v2 = _mm_srli_epi32(v, 1);
+    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
+    __m128 v2f = _mm_cvtepi32_ps(v2);
+    __m128 v1f = _mm_cvtepi32_ps(v1);
+    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
+{
+    __m128i T0, T1, T2, T3;
+    T0 = _mm_unpacklo_epi32(R0, R1);
+    T1 = _mm_unpacklo_epi32(R2, R3);
+    T2 = _mm_unpackhi_epi32(R0, R1);
+    T3 = _mm_unpackhi_epi32(R2, R3);
+    R0 = _mm_unpacklo_epi64(T0, T1);
+    R1 = _mm_unpackhi_epi64(T0, T1);
+    R2 = _mm_unpacklo_epi64(T2, T3);
+    R3 = _mm_unpackhi_epi64(T2, T3);
+}
+#endif
+
+#ifdef __SSE4_1__
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu64_pd(x);
+#else
+    __m128i xH = _mm_srli_epi64(x, 32);
+    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          // 2^84
+    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   // 2^52
+    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     // 2^84 + 2^52
+    return _mm_add_pd(f, _mm_castsi128_pd(xL));
+#endif
+}
+#endif
+
+#ifdef __AVX2__
+QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu32_ps(v);
+#else
+    __m256i v2 = _mm256_srli_epi32(v, 1);
+    __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
+    __m256 v2f = _mm256_cvtepi32_ps(v2);
+    __m256 v1f = _mm256_cvtepi32_ps(v1);
+    return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu64_pd(x);
+#else
+    __m256i xH = _mm256_srli_epi64(x, 32);
+    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.)));          // 2^84
+    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc);   // 2^52
+    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.));     // 2^84 + 2^52
+    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+#endif
+}
+#endif
+
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 423dbfab269176594bf782ba09bcb379b530a8bc..642cc9aa52c436620894006ce7dc902f030333d3 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -18,6 +18,8 @@
 #define QUALIFIERS static __forceinline__ __device__
 #endif
 
+#include "myintrin.h"
+
 #define PHILOX_W32_0 (0x9E3779B9)
 #define PHILOX_W32_1 (0xBB67AE85)
 #define PHILOX_M4x32_0 (0xD2511F53)
@@ -144,35 +146,6 @@ QUALIFIERS void _philox4x32bumpkey(__m128i* key)
     key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
 }
 
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          // 2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   // 2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     // 2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
 template<bool high>
 QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
 {
@@ -301,35 +274,6 @@ QUALIFIERS void _philox4x32bumpkey(__m256i* key)
     key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
 }
 
-QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
-{
-#ifdef __AVX512VL__
-    return _mm256_cvtepu32_ps(v);
-#else
-    __m256i v2 = _mm256_srli_epi32(v, 1);
-    __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
-    __m256 v2f = _mm256_cvtepi32_ps(v2);
-    __m256 v1f = _mm256_cvtepi32_ps(v1);
-    return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
-{
-#ifdef __AVX512VL__
-    return _mm256_cvtepu64_pd(x);
-#else
-    __m256i xH = _mm256_srli_epi64(x, 32);
-    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.)));          // 2^84
-    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc);   // 2^52
-    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.));     // 2^84 + 2^52
-    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
-#endif
-}
-
 template<bool high>
 QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
 {
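Note (not part of the patch): the helpers relocated into myintrin.h rely on two classic unsigned-conversion bit tricks whose magic constants are easy to misread. The scalar sketch below shows the same computations one lane at a time; the function names and the main() driver are illustrative only, not code from the repository.

// Standalone scalar sketch of the bit tricks in myintrin.h (illustrative).
#include <cstdint>
#include <cstring>
#include <cstdio>

// _my_cvtepu32_ps: _mm_cvtepi32_ps is a *signed* conversion, so inputs
// >= 2^31 would come out negative. Convert v>>1 (always < 2^31) and the
// dropped low bit separately, then recombine as 2*(v>>1) + (v&1).
static float cvtepu32_ps_scalar(std::uint32_t v)
{
    float v2f = static_cast<float>(static_cast<std::int32_t>(v >> 1));
    float v1f = static_cast<float>(static_cast<std::int32_t>(v & 1u));
    return (v2f + v2f) + v1f;
}

// _my_cvtepu64_pd: plant the two 32-bit halves of x in the mantissas of
// 2^84 and 2^52, then cancel the offsets. 0x4530... and 0x4330... are the
// bit patterns of the doubles 2^84 and 2^52; the decimal constant
// 19342813118337666422669312. in the patch is exactly 2^84 + 2^52.
static double cvtepu64_pd_scalar(std::uint64_t x)
{
    std::uint64_t hiBits = 0x4530000000000000ull | (x >> 32);           // double: 2^84 + xH*2^32
    std::uint64_t loBits = 0x4330000000000000ull | (x & 0xFFFFFFFFull); // double: 2^52 + xL
    double hi, lo;
    std::memcpy(&hi, &hiBits, sizeof hi);
    std::memcpy(&lo, &loBits, sizeof lo);
    // (2^84 + xH*2^32) - (2^84 + 2^52) + (2^52 + xL) = xH*2^32 + xL = x.
    // The sum must not be reassociated, which is why the SIMD versions
    // carry the no-associative-math attribute under GCC without AVX512VL.
    return (hi - 19342813118337666422669312.) + lo;
}

int main()
{
    std::printf("%g\n", cvtepu32_ps_scalar(0xFFFFFFFFu));           // ~4.29497e9
    std::printf("%g\n", cvtepu64_pd_scalar(0xFFFFFFFFFFFFFFFFull)); // ~1.84467e19
}

The same reading applies to the _mm_blend_epi16 call with mask 0xcc: it overwrites the upper 32 bits of each 64-bit lane of x with the exponent pattern of 2^52, which is the SIMD equivalent of the loBits line above.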