diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 36d5bbf6f8859b25920c569d9476e3485d5c5c2f..09327f27b8bc0b3cdd0b16cb8a64e3237b555797 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -4,7 +4,7 @@
 
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
-#if defined(__AVX512VL__) || defined(__AVX512F__)
+#ifdef __AVX512VL__
 #include <immintrin.h> // AVX*
 #endif
 #include <cstdint>
@@ -33,7 +33,7 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
 
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
-#if defined(__AVX512VL__) || defined(__AVX512F__)
+#ifdef __AVX512VL__
     return _mm_cvtepu32_ps(v);
 #else
     __m128i v2 = _mm_srli_epi32(v, 1);
@@ -46,7 +46,7 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
 {
-#if defined(__AVX512VL__) || defined(__AVX512F__)
+#ifdef __AVX512VL__
     return _mm_cvtepu64_pd(x);
 #else
     uint64 r[2];
@@ -110,5 +110,4 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
     rnd2 = r[1];
     rnd3 = r[2];
     rnd4 = r[3];
-}
-
+}
\ No newline at end of file
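Note on the guard change in this file: _mm_cvtepu32_ps and _mm_cvtepu64_pd
operate on 128-bit registers and therefore require AVX512VL; AVX512F alone
only guarantees the 512-bit forms, so the old "|| defined(__AVX512F__)" could
select intrinsics that an AVX512F-only target (e.g. Knights Landing) cannot
use. The #else branches keep the pre-AVX512 fallback. As a scalar
illustration of the unsigned-to-float workaround the fallback appears to use
(its vector code starts with _mm_srli_epi32(v, 1)): _mm_cvtepi32_ps is a
signed conversion, so the value is halved into signed range, converted,
doubled, and the dropped low bit is added back. A minimal sketch, not part of
the patch:

    #include <cstdint>

    float cvtepu32_ps_scalar(uint32_t v)
    {
        uint32_t half = v >> 1;  // fits in a signed 32-bit int
        uint32_t lsb  = v & 1u;  // the bit lost by the shift
        // v == 2*half + lsb, reassembled after a signed conversion
        return (float)(int32_t)half * 2.0f + (float)(int32_t)lsb;
    }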
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index a60f7fdf9857ac89b5c8037cfbfcf11a0012ec4c..27a7f0f142793ffbcdafc195f4a75eb584b078d4 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,9 +1,9 @@
 #include <cstdint>
 
-#ifdef __SSE__
+#ifdef __SSE4_1__
 #include <emmintrin.h> // SSE2
 #endif
-#ifdef __AVX__
+#ifdef __AVX2__
 #include <immintrin.h> // AVX*
 #else
 #include <smmintrin.h>  // SSE4
@@ -113,7 +113,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
     rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
 }
 
-#ifdef __SSE__
+#ifndef __CUDA_ARCH__
+#ifdef __SSE4_1__
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
     __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
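For reference, _mm_mul_epu32 performs the 32x32 -> 64 bit widening multiply
at the heart of a Philox-4x32 round (it multiplies only the even 32-bit
lanes, hence presumably the a/b pairs of calls in the full function body). A
scalar sketch of one round, assuming the PHILOX_M4x32_* round constants
defined earlier in this header follow the reference Philox definition:

    #include <cstdint>

    void philox4x32round_scalar(uint32_t ctr[4], const uint32_t key[2])
    {
        uint64_t p0 = (uint64_t)PHILOX_M4x32_0 * ctr[0];  // widening multiply
        uint64_t p1 = (uint64_t)PHILOX_M4x32_1 * ctr[2];
        uint32_t out[4] = {
            (uint32_t)(p1 >> 32) ^ ctr[1] ^ key[0],  // hi(p1) ^ ctr1 ^ key0
            (uint32_t)p1,                            // lo(p1)
            (uint32_t)(p0 >> 32) ^ ctr[3] ^ key[1],  // hi(p0) ^ ctr3 ^ key1
            (uint32_t)p0                             // lo(p0)
        };
        for (int i = 0; i < 4; ++i) ctr[i] = out[i];
    }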
@@ -156,7 +157,7 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 #endif
 }
 
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+#if !defined(__AVX512VL__) && !defined(__AVX512F__) && defined(__GNUC__) && __GNUC__ >= 5
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
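The extra !defined(__AVX512F__) matches the parallel change in the AVX2
section below. The no-associative-math attribute itself protects the
non-AVX512 fallback: the usual uint64-to-double conversion trick ends in a
magic-constant subtraction that must not be reassociated under -ffast-math.
A scalar rendering of that trick, shown as an illustration of what the
attribute guards (assuming the fallback uses the standard two-constant
sequence):

    #include <cstdint>
    #include <cstring>

    double cvtepu64_pd_scalar(uint64_t v)
    {
        // Patch the exponent field so the mantissa holds the raw bits:
        // lo == 2^52 + low32(v), hi == 2^84 + high32(v) * 2^32, both exact.
        uint64_t lo_bits = (v & 0xFFFFFFFFull) | 0x4330000000000000ull;
        uint64_t hi_bits = (v >> 32)           | 0x4530000000000000ull;
        double lo, hi;
        std::memcpy(&lo, &lo_bits, sizeof lo);
        std::memcpy(&hi, &hi_bits, sizeof hi);
        // (hi - (2^84 + 2^52)) + lo == v up to one final rounding;
        // reassociating this expression destroys the exact cancellation.
        return (hi - 19342813118337666422669312.0) + lo;
    }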
@@ -178,36 +179,36 @@ QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
     // convert 32 to 64 bit
     if (high)
     {
-        x = _mm_unpackhi_epi32(x, _mm_set1_epi32(0));
-        y = _mm_unpackhi_epi32(y, _mm_set1_epi32(0));;
+        x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
+        y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
     }
     else
     {
-        x = _mm_unpacklo_epi32(x, _mm_set1_epi32(0));
-        y = _mm_unpacklo_epi32(y, _mm_set1_epi32(0));;
+        x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
+        y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
     }
 
     // calculate z = x ^ (y << (53 - 32))
-    __m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
+    __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
     z = _mm_xor_si128(x, z);
 
     // convert uint64 to double
     __m128d rs = _my_cvtepu64_pd(z);
     // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
 #ifdef __FMA__
-    rs = _mm_fmadd_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE), _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
 #else
-    rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
-    rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
+    rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
 #endif
 
     return rs;
 }
 
 
-QUALIFIERS void philox_float16(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
-                               uint32 key0, uint32 key1,
-                               __m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
+QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
+                              uint32 key0, uint32 key1,
+                              __m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
 {
     __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
     __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
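The double path above folds two 32-bit Philox words into one 53-bit integer
(x occupies bits 0-31, y << 21 occupies bits 21-52; the overlapping bits are
XORed, which preserves uniformity) and maps it into the open interval (0,1)
as z * 2^-53 + 2^-54. The float functions below do the per-word analogue,
r * 2^-32 + 2^-33. A scalar sketch of the double construction, assuming
TWOPOW53_INV_DOUBLE is 2^-53 as its name suggests:

    #include <cstdint>

    double uniform_double_scalar(uint32_t x, uint32_t y)
    {
        // 53 random bits; z < 2^53, so the conversion to double is exact
        uint64_t z = (uint64_t)x ^ ((uint64_t)y << (53 - 32));
        return (double)z * TWOPOW53_INV_DOUBLE + TWOPOW53_INV_DOUBLE / 2.0;
    }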
@@ -229,24 +230,24 @@ QUALIFIERS void philox_float16(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i
     rnd4 = _my_cvtepu32_ps(ctr[3]);
     // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
 #ifdef __FMA__
-    rnd1 = _mm_fmadd_ps(rnd1, _mm_set_ps1(TWOPOW32_INV_FLOAT), _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd2 = _mm_fmadd_ps(rnd2, _mm_set_ps1(TWOPOW32_INV_FLOAT), _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd3 = _mm_fmadd_ps(rnd3, _mm_set_ps1(TWOPOW32_INV_FLOAT), _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd4 = _mm_fmadd_ps(rnd4, _mm_set_ps1(TWOPOW32_INV_FLOAT), _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0));
+    rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
 #else
-    rnd1 = _mm_mul_ps(rnd1, _mm_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd1 = _mm_add_ps(rnd1, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd2 = _mm_mul_ps(rnd2, _mm_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd2 = _mm_add_ps(rnd2, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd3 = _mm_mul_ps(rnd3, _mm_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd3 = _mm_add_ps(rnd3, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd4 = _mm_mul_ps(rnd4, _mm_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd4 = _mm_add_ps(rnd4, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
+    rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
 #endif
 }
 
 
-QUALIFIERS void philox_double8(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
+QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
                                uint32 key0, uint32 key1,
                                __m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
 {
@@ -270,7 +271,7 @@ QUALIFIERS void philox_double8(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i
 }
 #endif
 
-#ifdef __AVX__
+#ifdef __AVX2__
 QUALIFIERS void _philox4x32round(__m256i* ctr, __m256i* key)
 {
     __m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
@@ -313,7 +314,7 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
 #endif
 }
 
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+#if !defined(__AVX512VL__) && !defined(__AVX512F__) && defined(__GNUC__) && __GNUC__ >= 5
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
@@ -329,52 +330,42 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
 #endif
 }
 
-QUALIFIERS __m256 _my256_set_ps1(const float v)
-{
-    return _mm256_set_ps(v, v, v, v, v, v, v, v);
-}
-
-QUALIFIERS __m256d _my256_set_pd1(const double v)
-{
-    return _mm256_set_pd(v, v, v, v);
-}
-
 template<bool high>
 QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
 {
     // convert 32 to 64 bit
     if (high)
     {
-        x = _mm256_unpackhi_epi32(x, _mm256_set1_epi32(0));
-        y = _mm256_unpackhi_epi32(y, _mm256_set1_epi32(0));;
+        x = _mm256_unpackhi_epi32(x, _mm256_setzero_si256());
+        y = _mm256_unpackhi_epi32(y, _mm256_setzero_si256());
     }
     else
     {
-        x = _mm256_unpacklo_epi32(x, _mm256_set1_epi32(0));
-        y = _mm256_unpacklo_epi32(y, _mm256_set1_epi32(0));;
+        x = _mm256_unpacklo_epi32(x, _mm256_setzero_si256());
+        y = _mm256_unpacklo_epi32(y, _mm256_setzero_si256());
     }
 
     // calculate z = x ^ (y << (53 - 32))
-    __m256i z = _mm256_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
+    __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
     z = _mm256_xor_si256(x, z);
 
     // convert uint64 to double
     __m256d rs = _my256_cvtepu64_pd(z);
     // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
 #ifdef __FMA__
-    rs = _mm256_fmadd_pd(rs, _my256_set_pd1(TWOPOW53_INV_DOUBLE), _my256_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
 #else
-    rs = _mm256_mul_pd(rs, _my256_set_pd1(TWOPOW53_INV_DOUBLE));
-    rs = _mm256_add_pd(rs, _my256_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
+    rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
 #endif
 
     return rs;
 }
 
 
-QUALIFIERS void philox_float32(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
-                               uint32 key0, uint32 key1,
-                               __m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
+QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
+                              uint32 key0, uint32 key1,
+                              __m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
 {
     __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
     __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
@@ -396,26 +387,26 @@ QUALIFIERS void philox_float32(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i
     rnd4 = _my256_cvtepu32_ps(ctr[3]);
     // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
 #ifdef __FMA__
-    rnd1 = _mm256_fmadd_ps(rnd1, _my256_set_ps1(TWOPOW32_INV_FLOAT), _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd2 = _mm256_fmadd_ps(rnd2, _my256_set_ps1(TWOPOW32_INV_FLOAT), _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd3 = _mm256_fmadd_ps(rnd3, _my256_set_ps1(TWOPOW32_INV_FLOAT), _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd4 = _mm256_fmadd_ps(rnd4, _my256_set_ps1(TWOPOW32_INV_FLOAT), _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0));
+    rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
 #else
-    rnd1 = _mm256_mul_ps(rnd1, _my256_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd1 = _mm256_add_ps(rnd1, _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd2 = _mm256_mul_ps(rnd2, _my256_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd2 = _mm256_add_ps(rnd2, _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd3 = _mm256_mul_ps(rnd3, _my256_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd3 = _mm256_add_ps(rnd3, _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-    rnd4 = _mm256_mul_ps(rnd4, _my256_set_ps1(TWOPOW32_INV_FLOAT));
-    rnd4 = _mm256_add_ps(rnd4, _my256_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
+    rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
+    rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
 #endif
 }
 
 
-QUALIFIERS void philox_double16(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
-                                uint32 key0, uint32 key1,
-                                __m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
+QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
+                               uint32 key0, uint32 key1,
+                               __m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
 {
     __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
     __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
@@ -467,47 +458,37 @@ QUALIFIERS void _philox4x32bumpkey(__m512i* key)
     key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
 }
 
-QUALIFIERS __m512 _my512_set_ps1(const float v)
-{
-    return _mm512_set_ps(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v);
-}
-
-QUALIFIERS __m512d _my512_set_pd1(const double v)
-{
-    return _mm512_set_pd(v, v, v, v, v, v, v, v);
-}
-
 template<bool high>
 QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
 {
     // convert 32 to 64 bit
     if (high)
     {
-        x = _mm512_unpackhi_epi32(x, _mm512_set1_epi32(0));
-        y = _mm512_unpackhi_epi32(y, _mm512_set1_epi32(0));;
+        x = _mm512_unpackhi_epi32(x, _mm512_setzero_si512());
+        y = _mm512_unpackhi_epi32(y, _mm512_setzero_si512());
     }
     else
     {
-        x = _mm512_unpacklo_epi32(x, _mm512_set1_epi32(0));
-        y = _mm512_unpacklo_epi32(y, _mm512_set1_epi32(0));;
+        x = _mm512_unpacklo_epi32(x, _mm512_setzero_si512());
+        y = _mm512_unpacklo_epi32(y, _mm512_setzero_si512());
     }
 
     // calculate z = x ^ (y << (53 - 32))
-    __m512i z = _mm512_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
+    __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
     z = _mm512_xor_si512(x, z);
 
     // convert uint64 to double
     __m512d rs = _mm512_cvtepu64_pd(z);
     // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-    rs = _mm512_fmadd_pd(rs, _my512_set_pd1(TWOPOW53_INV_DOUBLE), _my512_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
 
     return rs;
 }
 
 
-QUALIFIERS void philox_float64(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
-                               uint32 key0, uint32 key1,
-                               __m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
+QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
+                              uint32 key0, uint32 key1,
+                              __m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
 {
     __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
     __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
@@ -528,16 +509,16 @@ QUALIFIERS void philox_float64(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i
     rnd3 = _mm512_cvtepu32_ps(ctr[2]);
     rnd4 = _mm512_cvtepu32_ps(ctr[3]);
     // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-    rnd1 = _mm512_fmadd_ps(rnd1, _my512_set_ps1(TWOPOW32_INV_FLOAT), _my512_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd2 = _mm512_fmadd_ps(rnd2, _my512_set_ps1(TWOPOW32_INV_FLOAT), _my512_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd3 = _mm512_fmadd_ps(rnd3, _my512_set_ps1(TWOPOW32_INV_FLOAT), _my512_set_ps1(TWOPOW32_INV_FLOAT/2.0));
-    rnd4 = _mm512_fmadd_ps(rnd4, _my512_set_ps1(TWOPOW32_INV_FLOAT), _my512_set_ps1(TWOPOW32_INV_FLOAT/2.0));
+    rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
 }
 
 
-QUALIFIERS void philox_double32(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
-                                uint32 key0, uint32 key1,
-                                __m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
+QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
+                               uint32 key0, uint32 key1,
+                               __m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
 {
     __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
     __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
@@ -558,4 +539,5 @@ QUALIFIERS void philox_double32(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512
     rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
 }
 #endif
+#endif
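Two structural changes complete this file: the width-suffixed names
(philox_float16, philox_float32, philox_float64 and the philox_doubleN
counterparts) collapse into philox_float4 / philox_double2 overloads of the
scalar functions, and the new trailing #endif closes the #ifndef
__CUDA_ARCH__ opened before the SSE section, so none of the x86 intrinsics
are seen during CUDA device compilation. With the rename, a caller can emit
one name and let overload resolution pick the vector width. A hypothetical
caller sketch (not part of the patch), using the SSE overload:

    void sample_sse(uint32 seed, uint32 block, __m128 out[4])
    {
        __m128i ctr0 = _mm_set1_epi32(block);
        __m128i zero = _mm_setzero_si128();
        // the same call compiles against the __m256i / __m512i overloads
        philox_float4(ctr0, zero, zero, zero, seed, 0u, out[0], out[1], out[2], out[3]);
    }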