Compare revisions

d45ad465 · d45ad465 · d45ad465 · d45ad465 · d45ad465 · d45ad465
--- a/src/pystencils/include/aesni_rand.h
+++ b/src/pystencils/include/aesni_rand.h
+/*
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2025, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+#endif
+#else
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
 #ifdef __AVX__
@@ -8,6 +53,7 @@
 #include <immintrin.h> // FMA
 #endif
 #endif
+#endif
 #include <cstdint>
 #include <array>
 #include <map>
@@ -21,6 +67,14 @@
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;

+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
+typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+#elif defined(__ARM_FEATURE_SVE)
+typedef svfloat32_t svfloat32_st;
+typedef svfloat64_t svfloat64_st;
+#endif
+
 template <typename T, std::size_t Alignment>
 class AlignedAllocator
 {
@@ -36,7 +90,14 @@ public:
        if (n == 0) {
            return nullptr;
        }
-        void * const p = _mm_malloc(n*sizeof(T), Alignment);
+#ifdef _WIN32
+        void * const p = _aligned_malloc(n*sizeof(T), Alignment);
+#else
+        void * p;
+        if (posix_memalign(&p, Alignment, n*sizeof(T)) != 0) {
+          p = nullptr;
+        }
+#endif
        if (p == nullptr) {
            throw std::bad_alloc();
        }
@@ -44,7 +105,11 @@ public:
    }

    void deallocate(T * const p, const std::size_t n) const {
-        _mm_free(p);
+#ifdef _WIN32
+        _aligned_free(p);
+#else
+        free(p);
+#endif
    }
 };

@@ -54,7 +119,7 @@ using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<c
 #if defined(__AES__) || defined(_MSC_VER)
 QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
    __m128i temp3; 
-    temp2 = _mm_shuffle_epi32(temp2 ,0xff); 
+    temp2 = _mm_shuffle_epi32(temp2, 0xff); 
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
@@ -241,6 +306,19 @@ QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
    return rs;
 }

+QUALIFIERS void transpose128(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
+{
+    __m128i T0, T1, T2, T3;
+    T0  = _mm_unpacklo_epi32(R0, R1);
+    T1  = _mm_unpacklo_epi32(R2, R3);
+    T2  = _mm_unpackhi_epi32(R0, R1);
+    T3  = _mm_unpackhi_epi32(R2, R3);
+    R0  = _mm_unpacklo_epi64(T0, T1);
+    R1  = _mm_unpackhi_epi64(T0, T1);
+    R2  = _mm_unpacklo_epi64(T2, T3);
+    R3  = _mm_unpackhi_epi64(T2, T3);
+}
+

 QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
@@ -249,12 +327,12 @@ QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i c
    // pack input and call AES
    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
    __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k128);
    }
-    _MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    // convert uint32 to float
    rnd1 = _my_cvtepu32_ps(ctr[0]);
@@ -287,12 +365,12 @@ QUALIFIERS void aesni_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i
    // pack input and call AES
    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
    __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k128);
    }
-    _MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
@@ -408,6 +486,20 @@ QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
 }


+QUALIFIERS void transpose128(__m256i & R0, __m256i & R1, __m256i & R2, __m256i & R3)
+{
+    __m256i T0, T1, T2, T3;
+    T0  = _mm256_unpacklo_epi32(R0, R1);
+    T1  = _mm256_unpacklo_epi32(R2, R3);
+    T2  = _mm256_unpackhi_epi32(R0, R1);
+    T3  = _mm256_unpackhi_epi32(R2, R3);
+    R0  = _mm256_unpacklo_epi64(T0, T1);
+    R1  = _mm256_unpackhi_epi64(T0, T1);
+    R2  = _mm256_unpacklo_epi64(T2, T3);
+    R3  = _mm256_unpackhi_epi64(T2, T3);
+}
+
+
 QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
                              __m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
@@ -415,33 +507,12 @@ QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i c
    // pack input and call AES
    __m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
    __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    __m128i a[4], b[4];
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm256_extractf128_si256(ctr[i], 0);
-        b[i] = _mm256_extractf128_si256(ctr[i], 1);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my256_set_m128i(b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k256);
    }
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm256_extractf128_si256(ctr[i], 0);
-        b[i] = _mm256_extractf128_si256(ctr[i], 1);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my256_set_m128i(b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    // convert uint32 to float
    rnd1 = _my256_cvtepu32_ps(ctr[0]);
@@ -474,33 +545,12 @@ QUALIFIERS void aesni_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i
    // pack input and call AES
    __m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
    __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    __m128i a[4], b[4];
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm256_extractf128_si256(ctr[i], 0);
-        b[i] = _mm256_extractf128_si256(ctr[i], 1);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my256_set_m128i(b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k256);
    }
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm256_extractf128_si256(ctr[i], 0);
-        b[i] = _mm256_extractf128_si256(ctr[i], 1);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my256_set_m128i(b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
@@ -622,6 +672,20 @@ QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
 }


+QUALIFIERS void transpose128(__m512i & R0, __m512i & R1, __m512i & R2, __m512i & R3)
+{
+    __m512i T0, T1, T2, T3;
+    T0  = _mm512_unpacklo_epi32(R0, R1);
+    T1  = _mm512_unpacklo_epi32(R2, R3);
+    T2  = _mm512_unpackhi_epi32(R0, R1);
+    T3  = _mm512_unpackhi_epi32(R2, R3);
+    R0  = _mm512_unpacklo_epi64(T0, T1);
+    R1  = _mm512_unpackhi_epi64(T0, T1);
+    R2  = _mm512_unpacklo_epi64(T2, T3);
+    R3  = _mm512_unpackhi_epi64(T2, T3);
+}
+
+
 QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
                              __m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
@@ -630,41 +694,12 @@ QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i c
    __m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
                                    key3, key2, key1, key0, key3, key2, key1, key0);
    __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    __m128i a[4], b[4], c[4], d[4];
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
-        b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
-        c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
-        d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    _MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
-    _MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k512);
    }
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
-        b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
-        c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
-        d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    _MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
-    _MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    // convert uint32 to float
    rnd1 = _mm512_cvtepu32_ps(ctr[0]);
@@ -687,41 +722,12 @@ QUALIFIERS void aesni_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i
    __m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
                                    key3, key2, key1, key0, key3, key2, key1, key0);
    __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    __m128i a[4], b[4], c[4], d[4];
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
-        b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
-        c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
-        d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    _MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
-    _MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
    for (int i = 0; i < 4; ++i)
    {
        ctr[i] = aesni1xm128i(ctr[i], k512);
    }
-    for (int i = 0; i < 4; ++i)
-    {
-        a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
-        b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
-        c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
-        d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
-    }
-    _MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
-    _MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
-    _MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
-    _MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
-    for (int i = 0; i < 4; ++i)
-    {
-        ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
-    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);

    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
@@ -771,3 +777,466 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr
 }
 #endif

+#if defined(__ARM_NEON)
+QUALIFIERS uint32x4_t aesni_keygen_assist(uint32x4_t temp1, uint32x4_t temp2) {
+    uint32x4_t temp3; 
+    temp2 = vdupq_laneq_u32(temp2, 3); 
+    temp3 = vextq_u32(vdupq_n_u32(0), temp1,  4 - 1);
+    temp1 = veorq_u32(temp1, temp3);
+    temp3 = vextq_u32(vdupq_n_u32(0), temp3,  4 - 1);
+    temp1 = veorq_u32(temp1, temp3);
+    temp3 = vextq_u32(vdupq_n_u32(0), temp3,  4 - 1);
+    temp1 = veorq_u32(temp1, temp3);
+    temp1 = veorq_u32(temp1, temp2); 
+    return temp1; 
+}
+
+QUALIFIERS uint32x4_t aesni_keygen_assist(uint32x4_t k, const unsigned char imm8)
+{
+    uint8x16_t a = vreinterpretq_u8_u32(k);
+    a = vaeseq_u8(a, vdupq_n_u8(0));
+    uint8x16_t dest = {
+        a[0x4], a[0x1], a[0xE], a[0xB],
+        a[0x1], a[0xE], a[0xB], a[0x4],
+        a[0xC], a[0x9], a[0x6], a[0x3],
+        a[0x9], a[0x6], a[0x3], a[0xC],
+    };
+    return vreinterpretq_u32_u8(veorq_u8(dest, vreinterpretq_u8_u32(uint32x4_t{0, imm8, 0, imm8})));
+}
+
+QUALIFIERS std::array<uint32x4_t,11> aesni_keygen(uint32x4_t k) {
+    std::array<uint32x4_t,11> rk;
+    uint32x4_t tmp;
+    
+    rk[0] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x1);
+    k = aesni_keygen_assist(k, tmp);
+    rk[1] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x2);
+    k = aesni_keygen_assist(k, tmp);
+    rk[2] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x4);
+    k = aesni_keygen_assist(k, tmp);
+    rk[3] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x8);
+    k = aesni_keygen_assist(k, tmp);
+    rk[4] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x10);
+    k = aesni_keygen_assist(k, tmp);
+    rk[5] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x20);
+    k = aesni_keygen_assist(k, tmp);
+    rk[6] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x40);
+    k = aesni_keygen_assist(k, tmp);
+    rk[7] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x80);
+    k = aesni_keygen_assist(k, tmp);
+    rk[8] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x1b);
+    k = aesni_keygen_assist(k, tmp);
+    rk[9] = k;
+    
+    tmp = aesni_keygen_assist(k, 0x36);
+    k = aesni_keygen_assist(k, tmp);
+    rk[10] = k;
+    
+    return rk;
+}
+
+QUALIFIERS const std::array<uint32x4_t,11> & aesni_roundkeys(const uint32x4_t & k128) {
+    alignas(16) std::array<uint32,4> a;
+    vst1q_u32((uint32_t*) a.data(), k128);
+    
+    static AlignedMap<std::array<uint32,4>, std::array<uint32x4_t,11>> roundkeys;
+    
+    if(roundkeys.find(a) == roundkeys.end()) {
+        auto rk = aesni_keygen(k128);
+        roundkeys[a] = rk;
+    }
+    return roundkeys[a];
+}
+
+QUALIFIERS uint32x4_t aesni1xm128i(const uint32x4_t & in, const uint32x4_t & k0) {
+    auto k = aesni_roundkeys(k0);
+    uint8x16_t x = vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_u32(in), vreinterpretq_u8_u32(k[0])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[1])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[2])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[3])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[4])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[5])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[6])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[7])));
+    x = vaesmcq_u8(vaeseq_u8(x, vreinterpretq_u8_u32(k[8])));
+    x = vaeseq_u8(x, vreinterpretq_u8_u32(k[9]));
+    x = veorq_u8(x, vreinterpretq_u8_u32(k[10]));
+    return vreinterpretq_u32_u8(x);
+}
+
+QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
+{
+    // pack input and call AES
+    uint32x4_t c128 = {ctr0, ctr1, ctr2, ctr3};
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    c128 = aesni1xm128i(c128, k128);
+
+    // convert uint32 to float
+    float32x4_t rs = vcvtq_f32_u32(c128);
+    // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
+    rs = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rs);
+
+    // store result
+    rnd1 = vgetq_lane_f32(rs, 0);
+    rnd2 = vgetq_lane_f32(rs, 1);
+    rnd3 = vgetq_lane_f32(rs, 2);
+    rnd4 = vgetq_lane_f32(rs, 3);
+}
+
+QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              double & rnd1, double & rnd2)
+{
+    // pack input and call AES
+    uint32x4_t c128 = {ctr0, ctr1, ctr2, ctr3};
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    c128 = aesni1xm128i(c128, k128);
+
+    // convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
+    uint32x4_t x = vandq_u32(c128, uint32x4_t{0xffffffff, 0, 0xffffffff, 0});
+    uint32x4_t y = vandq_u32(c128, uint32x4_t{0, 0xffffffff, 0, 0xffffffff});
+    y = vextq_u32(y, vdupq_n_u32(0), 1);
+
+    // calculate z = x ^ y << (53 - 32))
+    uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
+    z = veorq_u64(vreinterpretq_u64_u32(x), z);
+
+    // convert uint64 to double
+    float64x2_t rs = vcvtq_f64_u64(z);
+    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
+    rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE/2.0), vdupq_n_f64(TWOPOW53_INV_DOUBLE), rs);
+
+    // store result
+   rnd1 = vgetq_lane_f64(rs, 0);
+   rnd2 = vgetq_lane_f64(rs, 1);
+}
+
+template<bool high>
+QUALIFIERS float64x2_t _uniform_double_hq(uint32x4_t x, uint32x4_t y)
+{
+    // convert 32 to 64 bit
+    if (high)
+    {
+        x = vzip2q_u32(x, vdupq_n_u32(0));
+        y = vzip2q_u32(y, vdupq_n_u32(0));
+    }
+    else
+    {
+        x = vzip1q_u32(x, vdupq_n_u32(0));
+        y = vzip1q_u32(y, vdupq_n_u32(0));
+    }
+
+    // calculate z = x ^ y << (53 - 32))
+    uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
+    z = veorq_u64(vreinterpretq_u64_u32(x), z);
+
+    // convert uint64 to double
+    float64x2_t rs = vcvtq_f64_u64(z);
+    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
+    rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE/2.0), vdupq_n_f64(TWOPOW53_INV_DOUBLE), rs);
+
+    return rs;
+}
+
+
+QUALIFIERS void transpose128(uint32x4_t & R0, uint32x4_t & R1, uint32x4_t & R2, uint32x4_t & R3)
+{
+    uint32x4_t T0, T1, T2, T3;
+    T0  = vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(R0), vreinterpretq_u64_u32(R2)));
+    T1  = vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(R1), vreinterpretq_u64_u32(R3)));
+    T2  = vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(R0), vreinterpretq_u64_u32(R2)));
+    T3  = vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(R1), vreinterpretq_u64_u32(R3)));
+    R0  = vtrn1q_u32(T0, T1);
+    R1  = vtrn2q_u32(T0, T1);
+    R2  = vtrn1q_u32(T2, T3);
+    R3  = vtrn2q_u32(T2, T3);
+}
+
+
+QUALIFIERS void aesni_float4(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
+{
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
+    for (int i = 0; i < 4; ++i)
+    {
+        ctr[i] = aesni1xm128i(ctr[i], k128);
+    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
+
+    // convert uint32 to float
+    rnd1 = vcvtq_f32_u32(ctr[0]);
+    rnd2 = vcvtq_f32_u32(ctr[1]);
+    rnd3 = vcvtq_f32_u32(ctr[2]);
+    rnd4 = vcvtq_f32_u32(ctr[3]);
+    // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
+    rnd1 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd1);
+    rnd2 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd2);
+    rnd3 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd3);
+    rnd4 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd4);
+}
+
+
+QUALIFIERS void aesni_double2(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
+{
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
+    for (int i = 0; i < 4; ++i)
+    {
+        ctr[i] = aesni1xm128i(ctr[i], k128);
+    }
+    transpose128(ctr[0], ctr[1], ctr[2], ctr[3]);
+
+    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
+    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
+    rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
+    rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
+}
+
+QUALIFIERS void aesni_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
+{
+    uint32x4_t ctr0v = vdupq_n_u32(ctr0);
+    uint32x4_t ctr2v = vdupq_n_u32(ctr2);
+    uint32x4_t ctr3v = vdupq_n_u32(ctr3);
+
+    aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
+}
+
+#ifndef _MSC_VER
+QUALIFIERS void aesni_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
+{
+    aesni_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
+}
+#endif
+
+QUALIFIERS void aesni_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
+{
+    uint32x4_t ctr0v = vdupq_n_u32(ctr0);
+    uint32x4_t ctr2v = vdupq_n_u32(ctr2);
+    uint32x4_t ctr3v = vdupq_n_u32(ctr3);
+
+    aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
+}
+
+QUALIFIERS void aesni_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              float64x2_t & rnd1, float64x2_t & rnd2)
+{
+    uint32x4_t ctr0v = vdupq_n_u32(ctr0);
+    uint32x4_t ctr2v = vdupq_n_u32(ctr2);
+    uint32x4_t ctr3v = vdupq_n_u32(ctr3);
+
+    float64x2_t ignore;
+    aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
+}
+
+#ifndef _MSC_VER
+QUALIFIERS void aesni_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              float64x2_t & rnd1, float64x2_t & rnd2)
+{
+    aesni_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, key2, key3, rnd1, rnd2);
+}
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+template<bool high>
+QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
+{
+    // convert 32 to 64 bit
+    if (high)
+    {
+        x = svzip2_u32(x, svdup_u32(0));
+        y = svzip2_u32(y, svdup_u32(0));
+    }
+    else
+    {
+        x = svzip1_u32(x, svdup_u32(0));
+        y = svzip1_u32(y, svdup_u32(0));
+    }
+
+    // calculate z = x ^ y << (53 - 32))
+    svuint64_t z = svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
+    z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);
+
+    // convert uint64 to double
+    svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
+    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
+    rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE), svdup_f64(TWOPOW53_INV_DOUBLE/2.0));
+
+    return rs;
+}
+
+
+QUALIFIERS void transpose128(svuint32x4_t & R)
+{
+    svuint32_t T0, T1, T2, T3;
+    T0  = svreinterpret_u32_u64(svtrn1_u64(svreinterpret_u64_u32(svget4_u32(R, 0)), svreinterpret_u64_u32(svget4_u32(R, 2))));
+    T1  = svreinterpret_u32_u64(svtrn1_u64(svreinterpret_u64_u32(svget4_u32(R, 1)), svreinterpret_u64_u32(svget4_u32(R, 3))));
+    T2  = svreinterpret_u32_u64(svtrn2_u64(svreinterpret_u64_u32(svget4_u32(R, 0)), svreinterpret_u64_u32(svget4_u32(R, 2))));
+    T3  = svreinterpret_u32_u64(svtrn2_u64(svreinterpret_u64_u32(svget4_u32(R, 1)), svreinterpret_u64_u32(svget4_u32(R, 3))));
+    R = svset4_u32(R, 0, svtrn1_u32(T0, T1));
+    R = svset4_u32(R, 1, svtrn2_u32(T0, T1));
+    R = svset4_u32(R, 2, svtrn1_u32(T2, T3));
+    R = svset4_u32(R, 3, svtrn2_u32(T2, T3));
+}
+
+
+QUALIFIERS svuint32_t aesni1xm128i(const svuint32_t & in, const uint32x4_t & k0)
+{
+#ifdef __ARM_FEATURE_SVE2_AES
+  auto k = aesni_roundkeys(k0);
+  svuint8_t x = svaesmc_u8(svaese_u8(svreinterpret_u8_u32(in), svdup_neonq_u8(vreinterpretq_u8_u32(k[0]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[1]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[2]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[3]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[4]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[5]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[6]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[7]))));
+  x = svaesmc_u8(svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[8]))));
+  x = svaese_u8(x, svdup_neonq_u8(vreinterpretq_u8_u32(k[9])));
+  x = sveor_u8_x(svptrue_b8(), x, svdup_neonq_u8(vreinterpretq_u8_u32(k[10])));
+  return svreinterpret_u32_u8(x);
+#else
+  svuint32_t x;
+  for (int i = 0; i < svcntw(); i += 4)
+  {
+    svbool_t pred = svbic_z(svptrue_b32(), svwhilelt_b32_u32(0, i+4), svwhilelt_b32_u32(0, i));
+    uint32x4_t a = aesni1xm128i(svget_neonq_u32(svcompact_u32(pred, in)), k0);
+    x = svsel_u32(pred, svdup_neonq_u32(a), x);
+  }
+  return x;
+#endif
+}
+
+
+QUALIFIERS void aesni_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+{
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
+    transpose128(ctr);
+    ctr = svset4_u32(ctr, 0, aesni1xm128i(svget4_u32(ctr, 0), k128));
+    ctr = svset4_u32(ctr, 1, aesni1xm128i(svget4_u32(ctr, 1), k128));
+    ctr = svset4_u32(ctr, 2, aesni1xm128i(svget4_u32(ctr, 2), k128));
+    ctr = svset4_u32(ctr, 3, aesni1xm128i(svget4_u32(ctr, 3), k128));
+    transpose128(ctr);
+
+    // convert uint32 to float
+    rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
+    rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
+    rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
+    rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
+    // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
+    rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
+    rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
+    rnd3 = svmad_f32_x(svptrue_b32(), rnd3, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
+    rnd4 = svmad_f32_x(svptrue_b32(), rnd4, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
+}
+
+
+QUALIFIERS void aesni_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+{
+    uint32x4_t k128 = {key0, key1, key2, key3};
+    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
+    transpose128(ctr);
+    ctr = svset4_u32(ctr, 0, aesni1xm128i(svget4_u32(ctr, 0), k128));
+    ctr = svset4_u32(ctr, 1, aesni1xm128i(svget4_u32(ctr, 1), k128));
+    ctr = svset4_u32(ctr, 2, aesni1xm128i(svget4_u32(ctr, 2), k128));
+    ctr = svset4_u32(ctr, 3, aesni1xm128i(svget4_u32(ctr, 3), k128));
+    transpose128(ctr);
+
+    rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
+    rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
+    rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
+    rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
+}
+
+QUALIFIERS void aesni_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+{
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
+
+    aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
+}
+
+QUALIFIERS void aesni_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+{
+    aesni_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
+}
+
+QUALIFIERS void aesni_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+{
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
+
+    aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
+}
+
+QUALIFIERS void aesni_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              svfloat64_st & rnd1, svfloat64_st & rnd2)
+{
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
+
+    svfloat64_st ignore;
+    aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
+}
+
+QUALIFIERS void aesni_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              svfloat64_st & rnd1, svfloat64_st & rnd2)
+{
+    aesni_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, key2, key3, rnd1, rnd2);
+}
+#endif
+
+#undef QUALIFIERS
+#undef TWOPOW53_INV_DOUBLE
+#undef TWOPOW32_INV_FLOAT
--- a/src/pystencils/include/arm_neon_helpers.h
+++ b/src/pystencils/include/arm_neon_helpers.h
+/*
+Copyright 2021-2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
 #if defined(_MSC_VER)
 #define __ARM_NEON
 #endif

+#include <cstddef>
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif

--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
+/*
+Copyright 2023, Markus Holzer.
+Copyright 2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #pragma once

 #define POS_INFINITY __int_as_float(0x7f800000)

--- a/src/pystencils/include/half_precision.h
+++ b/src/pystencils/include/half_precision.h
+/*
+Copyright 2023, Markus Holzer.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
 /// Half precision support. Experimental. Use carefully.
 ///
 /// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.

--- a/src/pystencils/include/myintrin.h
+++ b/src/pystencils/include/myintrin.h
+/*
+Copyright 2019-2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #pragma once

 #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
@@ -13,19 +44,6 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
 #endif
 }
-
-QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
-{
-    __m128i T0, T1, T2, T3;
-    T0  = _mm_unpacklo_epi32(R0, R1);
-    T1  = _mm_unpacklo_epi32(R2, R3);
-    T2  = _mm_unpackhi_epi32(R0, R1);
-    T3  = _mm_unpackhi_epi32(R2, R3);
-    R0  = _mm_unpacklo_epi64(T0, T1);
-    R1  = _mm_unpackhi_epi64(T0, T1);
-    R2  = _mm_unpacklo_epi64(T2, T3);
-    R3  = _mm_unpackhi_epi64(T2, T3);
-}
 #endif

 #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
@@ -110,4 +128,3 @@ QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
    return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
 }
 #endif
-
--- a/src/pystencils/include/philox_rand.h
+++ b/src/pystencils/include/philox_rand.h
+/*
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2024, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
 #if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
 #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
@@ -18,7 +52,7 @@
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#ifdef __ARM_FEATURE_SVE
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 #include <arm_sve.h>
 #endif

@@ -38,6 +72,14 @@
 #endif
 #endif

+#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
+#define SVE_QUALIFIERS __arm_streaming_compatible
+#elif defined(__ARM_FEATURE_SME)
+#define SVE_QUALIFIERS __arm_streaming
+#else
+#define SVE_QUALIFIERS
+#endif
+
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
@@ -69,7 +111,7 @@ typedef std::uint64_t uint64;
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
 typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
 typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-#elif defined(__ARM_FEATURE_SVE)
+#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 typedef svfloat32_t svfloat32_st;
 typedef svfloat64_t svfloat64_st;
 #endif
@@ -714,8 +756,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
 #endif


-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
+SVE_QUALIFIERS
 {
    svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
    svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
@@ -729,6 +772,7 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
 }

 QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
+SVE_QUALIFIERS
 {
    key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
    key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
@@ -736,6 +780,7 @@ QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)

 template<bool high>
 QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
+SVE_QUALIFIERS
 {
    // convert 32 to 64 bit
    if (high)
@@ -765,6 +810,7 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
                              uint32 key0, uint32 key1,
                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS
 {
    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -795,6 +841,7 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
 QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
                               uint32 key0, uint32 key1,
                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+SVE_QUALIFIERS
 {
    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -818,6 +865,7 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2
 QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1,
                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -829,6 +877,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
 QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1,
                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS
 {
    philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
@@ -836,6 +885,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 c
 QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+SVE_QUALIFIERS
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -847,6 +897,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
 QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+SVE_QUALIFIERS
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -859,6 +910,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
 QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+SVE_QUALIFIERS
 {
    philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
@@ -869,21 +921,21 @@ QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32
 QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32m1_t & ctr2, vuint32m1_t & ctr3,
                                 vuint32m1_t key0, vuint32m1_t key1)
 {
-    vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
+    vuint32m1_t lo0 = __riscv_vmul_vv_u32m1(ctr0, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    vuint32m1_t hi0 = __riscv_vmulhu_vv_u32m1(ctr0, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    vuint32m1_t lo1 = __riscv_vmul_vv_u32m1(ctr2, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_1, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    vuint32m1_t hi1 = __riscv_vmulhu_vv_u32m1(ctr2, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_1, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());

-    ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1());
+    ctr0 = __riscv_vxor_vv_u32m1(__riscv_vxor_vv_u32m1(hi1, ctr1, __riscv_vsetvlmax_e32m1()), key0, __riscv_vsetvlmax_e32m1());
    ctr1 = lo1;
-    ctr2 = vxor_vv_u32m1(vxor_vv_u32m1(hi0, ctr3, vsetvlmax_e32m1()), key1, vsetvlmax_e32m1());
+    ctr2 = __riscv_vxor_vv_u32m1(__riscv_vxor_vv_u32m1(hi0, ctr3, __riscv_vsetvlmax_e32m1()), key1, __riscv_vsetvlmax_e32m1());
    ctr3 = lo0;
 }

 QUALIFIERS void _philox4x32bumpkey(vuint32m1_t & key0, vuint32m1_t & key1)
 {
-    key0 = vadd_vv_u32m1(key0, vmv_v_x_u32m1(PHILOX_W32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    key1 = vadd_vv_u32m1(key1, vmv_v_x_u32m1(PHILOX_W32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
+    key0 = __riscv_vadd_vv_u32m1(key0, __riscv_vmv_v_x_u32m1(PHILOX_W32_0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    key1 = __riscv_vadd_vv_u32m1(key1, __riscv_vmv_v_x_u32m1(PHILOX_W32_1, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
 }

 template<bool high>
@@ -892,21 +944,21 @@ QUALIFIERS vfloat64m1_t _uniform_double_hq(vuint32m1_t x, vuint32m1_t y)
    // convert 32 to 64 bit
    if (high)
    {
-        size_t s = vsetvlmax_e32m1();
-        x = vslidedown_vx_u32m1(vundefined_u32m1(), x, s/2, s);
-        y = vslidedown_vx_u32m1(vundefined_u32m1(), y, s/2, s);
+        size_t s = __riscv_vsetvlmax_e32m1();
+        x = __riscv_vslidedown_vx_u32m1(x, s/2, s);
+        y = __riscv_vslidedown_vx_u32m1(y, s/2, s);
    }
-    vuint64m1_t x64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(x), vsetvlmax_e64m1());
-    vuint64m1_t y64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(y), vsetvlmax_e64m1());
+    vuint64m1_t x64 = __riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(x), __riscv_vsetvlmax_e64m1());
+    vuint64m1_t y64 = __riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(y), __riscv_vsetvlmax_e64m1());

    // calculate z = x ^ y << (53 - 32))
-    vuint64m1_t z = vsll_vx_u64m1(y64, 53 - 32, vsetvlmax_e64m1());
-    z = vxor_vv_u64m1(x64, z, vsetvlmax_e64m1());
+    vuint64m1_t z = __riscv_vsll_vx_u64m1(y64, 53 - 32, __riscv_vsetvlmax_e64m1());
+    z = __riscv_vxor_vv_u64m1(x64, z, __riscv_vsetvlmax_e64m1());

    // convert uint64 to double
-    vfloat64m1_t rs = vfcvt_f_xu_v_f64m1(z, vsetvlmax_e64m1());
+    vfloat64m1_t rs = __riscv_vfcvt_f_xu_v_f64m1(z, __riscv_vsetvlmax_e64m1());
    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-    rs = vfmadd_vv_f64m1(rs, vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE, vsetvlmax_e64m1()), vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE/2.0, vsetvlmax_e64m1()), vsetvlmax_e64m1());
+    rs = __riscv_vfmadd_vv_f64m1(rs, __riscv_vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE, __riscv_vsetvlmax_e64m1()), __riscv_vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE/2.0, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());

    return rs;
 }
@@ -916,8 +968,8 @@ QUALIFIERS void philox_float4(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ct
                              uint32 key0, uint32 key1,
                              vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
 {
-    vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
-    vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
+    vuint32m1_t key0v = __riscv_vmv_v_x_u32m1(key0, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t key1v = __riscv_vmv_v_x_u32m1(key1, __riscv_vsetvlmax_e32m1());
    _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);                                  // 1
    _philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);  // 2
    _philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);  // 3
@@ -930,15 +982,15 @@ QUALIFIERS void philox_float4(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ct
    _philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);  // 10

    // convert uint32 to float
-    rnd1 = vfcvt_f_xu_v_f32m1(ctr0, vsetvlmax_e32m1());
-    rnd2 = vfcvt_f_xu_v_f32m1(ctr1, vsetvlmax_e32m1());
-    rnd3 = vfcvt_f_xu_v_f32m1(ctr2, vsetvlmax_e32m1());
-    rnd4 = vfcvt_f_xu_v_f32m1(ctr3, vsetvlmax_e32m1());
+    rnd1 = __riscv_vfcvt_f_xu_v_f32m1(ctr0, __riscv_vsetvlmax_e32m1());
+    rnd2 = __riscv_vfcvt_f_xu_v_f32m1(ctr1, __riscv_vsetvlmax_e32m1());
+    rnd3 = __riscv_vfcvt_f_xu_v_f32m1(ctr2, __riscv_vsetvlmax_e32m1());
+    rnd4 = __riscv_vfcvt_f_xu_v_f32m1(ctr3, __riscv_vsetvlmax_e32m1());
    // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-    rnd1 = vfmadd_vv_f32m1(rnd1, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    rnd2 = vfmadd_vv_f32m1(rnd2, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    rnd3 = vfmadd_vv_f32m1(rnd3, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
-    rnd4 = vfmadd_vv_f32m1(rnd4, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
+    rnd1 = __riscv_vfmadd_vv_f32m1(rnd1, __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()), __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    rnd2 = __riscv_vfmadd_vv_f32m1(rnd2, __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()), __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    rnd3 = __riscv_vfmadd_vv_f32m1(rnd3, __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()), __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
+    rnd4 = __riscv_vfmadd_vv_f32m1(rnd4, __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()), __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m1());
 }


@@ -946,8 +998,8 @@ QUALIFIERS void philox_double2(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t c
                               uint32 key0, uint32 key1,
                               vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
 {
-    vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
-    vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
+    vuint32m1_t key0v = __riscv_vmv_v_x_u32m1(key0, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t key1v = __riscv_vmv_v_x_u32m1(key1, __riscv_vsetvlmax_e32m1());
    _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);                                  // 1
    _philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);  // 2
    _philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v);  // 3
@@ -969,9 +1021,9 @@ QUALIFIERS void philox_float4(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32
                              uint32 key0, uint32 key1,
                              vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
 {
-    vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
-    vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
-    vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
+    vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
@@ -980,16 +1032,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32
                              uint32 key0, uint32 key1,
                              vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
 {
-    philox_float4(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
+    philox_float4(ctr0, __riscv_vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }

 QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
 {
-    vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
-    vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
-    vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
+    vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
 }
@@ -998,9 +1050,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint3
                               uint32 key0, uint32 key1,
                               vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
 {
-    vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
-    vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
-    vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
+    vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
+    vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

    vfloat64m1_t ignore;
    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
@@ -1010,7 +1062,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32
                               uint32 key0, uint32 key1,
                               vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
 {
-    philox_double2(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
+    philox_double2(ctr0, __riscv_vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
 #endif

@@ -1338,3 +1390,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ct
 #endif
 #endif

+#undef QUALIFIERS
+#undef SVE_QUALIFIERS
+#undef PHILOX_W32_0
+#undef PHILOX_W32_1
+#undef PHILOX_M4x32_0
+#undef PHILOX_M4x32_1
+#undef TWOPOW53_INV_DOUBLE
+#undef TWOPOW32_INV_FLOAT
--- a/src/pystencils/include/ppc_altivec_helpers.h
+++ b/src/pystencils/include/ppc_altivec_helpers.h
+/*
+Copyright 2021, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #include <altivec.h>
 #undef vector
 #undef bool

--- a/src/pystencils/include/riscv_v_helpers.h
+++ b/src/pystencils/include/riscv_v_helpers.h
+/*
+Copyright 2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+inline void cachelineZero(void * p) {
+#ifdef __riscv_zicboz
+	__asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
+#endif
+}
+
+inline size_t _cachelineSize() {
+	// allocate and fill with ones
+	const size_t max_size = 0x100000;
+	uint8_t data[2*max_size];
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	
+	// find alignment offset
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+
+	// find the last byte that was zeroed
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	
+	// too much was zeroed
+	return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+#ifdef __riscv_zicboz
+	static size_t size = _cachelineSize();
+	return size;
+#else
+	return SIZE_MAX;
+#endif
+}
--- a/src/pystencils/kernel_contrains_check.py
+++ b/src/pystencils/kernel_contrains_check.py
@@ -38,6 +38,7 @@ class KernelConstraintsCheck:

    def __init__(self, check_independence_condition=True, check_double_write_condition=True):
        self.scopes = NestedScopes()
+        self.field_reads = defaultdict(set)
        self.field_writes = defaultdict(set)
        self.fields_read = set()
        self.check_independence_condition = check_independence_condition
@@ -111,6 +112,13 @@ class KernelConstraintsCheck:
            if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                raise ValueError(
                    f"Field {lhs.field.name} is written at two different locations")
+
+            if fai in self.field_reads:
+                reads = tuple(self.field_reads[fai])
+                if len(reads) > 1 or lhs.offsets != reads[0]:
+                    if self.check_independence_condition:
+                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
+                                         f"This means the resulting kernel would not be thread safe")
        elif isinstance(lhs, sp.Symbol):
            if self.scopes.is_defined_locally(lhs):
                raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
@@ -120,8 +128,9 @@ class KernelConstraintsCheck:

    def update_accesses_rhs(self, rhs):
        if isinstance(rhs, Field.Access) and self.check_independence_condition:
-            writes = self.field_writes[self.FieldAndIndex(
-                rhs.field, rhs.index)]
+            fai = self.FieldAndIndex(rhs.field, rhs.index)
+            writes = self.field_writes[fai]
+            self.field_reads[fai].add(rhs.offsets)
            for write_offset in writes:
                assert len(writes) == 1
                if write_offset != rhs.offsets:

--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -128,6 +128,7 @@ def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelCon
    # --- check constrains
    check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
                                   check_double_write_condition=not config.allow_double_writes)
+
    check.visit(assignments)

    assignments.bound_fields = check.fields_written

--- a/src/pystencils/simp/simplifications.py
+++ b/src/pystencils/simp/simplifications.py
@@ -190,7 +190,7 @@ def add_subexpressions_for_field_reads(ac, subexpressions=True, main_assignments
            field_reads.update(assignment.rhs.atoms(Field.Access))

    if not field_reads:
-        return
+        return ac

    substitutions = dict()
    for fa in field_reads:

--- a/src/pystencils/transformations.py
+++ b/src/pystencils/transformations.py
@@ -276,8 +276,10 @@ def add_outer_loop_over_indexed_elements(loop_node: ast.Block) -> ast.Block:
    if len(indexed_elements) == 0:
        return loop_node
    reference_element = get_common_indexed_element(indexed_elements)
+    index = reference_element.indices[0].atoms(TypedSymbol)
+    assert len(index) == 1, "index expressions must only contain one symbol representing the index"
    new_loop = ast.LoopOverCoordinate(loop_node, 0, 0,
-                                      reference_element.shape[0], 1, custom_loop_ctr=reference_element.indices[0])
+                                      reference_element.shape[0], 1, custom_loop_ctr=index.pop())
    return ast.Block([new_loop])



--- a/src/pystencils/typing/__init__.py
+++ b/src/pystencils/typing/__init__.py
@@ -3,14 +3,14 @@ from pystencils.typing.cast_functions import (CastFunc, BooleanCastFunc, VectorM
 from pystencils.typing.types import (is_supported_type, numpy_name_to_c, AbstractType, BasicType, VectorType,
                                     PointerType, StructType, create_type)
 from pystencils.typing.typed_sympy import (assumptions_from_dtype, TypedSymbol, FieldStrideSymbol, FieldShapeSymbol,
-                                           FieldPointerSymbol)
+                                           FieldPointerSymbol, CFunction)
 from pystencils.typing.utilities import (typed_symbols, get_base_type, result_type, collate_types,
                                         get_type_of_expression, get_next_parent_of_type, parents_of_type)


 __all__ = ['CastFunc', 'BooleanCastFunc', 'VectorMemoryAccess', 'ReinterpretCastFunc', 'PointerArithmeticFunc',
           'is_supported_type', 'numpy_name_to_c', 'AbstractType', 'BasicType',
-           'VectorType', 'PointerType', 'StructType', 'create_type',
-           'assumptions_from_dtype', 'TypedSymbol', 'FieldStrideSymbol', 'FieldShapeSymbol', 'FieldPointerSymbol',
+           'VectorType', 'PointerType', 'StructType', 'create_type', 'assumptions_from_dtype',
+           'TypedSymbol', 'FieldStrideSymbol', 'FieldShapeSymbol', 'FieldPointerSymbol', 'CFunction',
           'typed_symbols', 'get_base_type', 'result_type', 'collate_types',
           'get_type_of_expression', 'get_next_parent_of_type', 'parents_of_type']
--- a/src/pystencils/typing/leaf_typing.py
+++ b/src/pystencils/typing/leaf_typing.py
@@ -11,6 +11,7 @@ from sympy.core.relational import Relational
 from sympy.functions.elementary.piecewise import ExprCondPair
 from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
 from sympy.functions.elementary.hyperbolic import HyperbolicFunction
+from sympy.functions.elementary.integers import RoundFunction
 from sympy.logic.boolalg import BooleanFunction
 from sympy.logic.boolalg import BooleanAtom

@@ -213,7 +214,7 @@ class TypeAdder:
                    new_args.append(a)
            return expr.func(*new_args) if new_args else expr, collated_type
        elif isinstance(expr, (sp.Pow, sp.exp, InverseTrigonometricFunction, TrigonometricFunction,
-                               HyperbolicFunction, sp.log)):
+                               HyperbolicFunction, sp.log, RoundFunction)):
            args_types = [self.figure_out_type(arg) for arg in expr.args]
            collated_type = collate_types([t for _, t in args_types])
            new_args = [a if t.dtype_eq(collated_type) else CastFunc(a, collated_type) for a, t in args_types]

--- a/src/pystencils/typing/typed_sympy.py
+++ b/src/pystencils/typing/typed_sympy.py
@@ -178,3 +178,20 @@ class FieldPointerSymbol(TypedSymbol):

    __xnew__ = staticmethod(__new_stage2__)
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
+
+
+class CFunction(TypedSymbol):
+    def __new__(cls, function, dtype):
+        return CFunction.__xnew_cached_(cls, function, dtype)
+
+    def __new_stage2__(cls, function, dtype):
+        return super(CFunction, cls).__xnew__(cls, function, dtype)
+
+    __xnew__ = staticmethod(__new_stage2__)
+    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
+
+    def __getnewargs__(self):
+        return self.name, self.dtype
+
+    def __getnewargs_ex__(self):
+        return (self.name, self.dtype), {}
--- a/src/pystencils/typing/types.py
+++ b/src/pystencils/typing/types.py
@@ -7,7 +7,7 @@ import sympy as sp

 def is_supported_type(dtype: np.dtype):
    scalar = dtype.type
-    c = np.issctype(dtype)
+    c = np.issubdtype(dtype, np.generic)
    subclass = issubclass(scalar, np.floating) or issubclass(scalar, np.integer) or issubclass(scalar, np.bool_)
    additional_checks = dtype.fields is None and dtype.hasobject is False and dtype.subdtype is None
    return c and subclass and additional_checks

--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -82,8 +82,8 @@ def boolean_array_bounding_box(boolean_array):

    >>> a = np.zeros((4, 4), dtype=bool)
    >>> a[1:-1, 1:-1] = True
-    >>> boolean_array_bounding_box(a)
-    [(1, 3), (1, 3)]
+    >>> boolean_array_bounding_box(a) == [(1, 3), (1, 3)]
+    True
    """
    dim = boolean_array.ndim
    shape = boolean_array.shape

--- a/tests/test_conditional_vec.py
+++ b/tests/test_conditional_vec.py
@@ -3,6 +3,7 @@ import sympy as sp
 import pytest

 import pystencils as ps
+from pystencils.alignedarray import aligned_zeros
 from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
@@ -15,7 +16,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        width = 4  # we don't know the actual value
    else:
        width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -34,7 +35,7 @@ def test_vec_any(instruction_set, dtype):
                           cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        # we only know that the first value has changed
        np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
    else:
@@ -44,7 +45,7 @@ def test_vec_any(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        width = 1000  # we don't know the actual value, need something guaranteed larger than vector
    else:
        width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -59,7 +60,7 @@ def test_vec_all(instruction_set, dtype):
                           cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        # we only know that some values in the middle have been replaced
        assert np.all(data_arr[3:9, :2] <= 1.0)
        assert np.any(data_arr[3:9, 2:] == 2.0)
@@ -94,16 +95,60 @@ def test_boolean_before_loop():

 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=dtype)
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
    data_arr[3:-3, 3:-3] = 1.0
    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)

    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]

    assignmets = NodeCollection(c)
-    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
    ast = ps.create_kernel(assignmets, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+                      'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+
+    assignmets = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        with pytest.warns(None) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert len(warn) == 0
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
    print(ps.get_code_str(ast))
    kernel = ast.compile()
    kernel(data=data_arr)

--- a/tests/test_field.py
+++ b/tests/test_field.py
@@ -5,21 +5,33 @@ import sympy as sp
 import pystencils as ps
 from pystencils import TypedSymbol
 from pystencils.typing import create_type
-from pystencils.field import Field, FieldType, layout_string_to_tuple
+from pystencils.field import Field, FieldType, layout_string_to_tuple, spatial_layout_string_to_tuple


 def test_field_basic():
-    f = Field.create_generic('f', spatial_dimensions=2)
+    f = Field.create_generic("f", spatial_dimensions=2)
    assert FieldType.is_generic(f)
-    assert f['E'] == f[1, 0]
-    assert f['N'] == f[0, 1]
-    assert '_' in f.center._latex('dummy')
-
-    assert f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[0] == 0
-    assert f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[1] == 0
-
-    assert f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[0] == 0
-    assert f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[1] == 0
+    assert f["E"] == f[1, 0]
+    assert f["N"] == f[0, 1]
+    assert "_" in f.center._latex("dummy")
+
+    assert (
+        f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[0]
+        == 0
+    )
+    assert (
+        f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[1]
+        == 0
+    )
+
+    assert (
+        f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[0]
+        == 0
+    )
+    assert (
+        f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[1]
+        == 0
+    )

    f1 = f.new_field_with_different_name("f1")
    assert f1.ndim == f.ndim
@@ -28,7 +40,7 @@ def test_field_basic():
    fixed = ps.fields("f(5, 5) : double[20, 20]")
    assert fixed.neighbor_vector((1, 1)).shape == (5, 5)

-    f = Field.create_fixed_size('f', (10, 10), strides=(80, 8), dtype=np.float64)
+    f = Field.create_fixed_size("f", (10, 10), strides=(80, 8), dtype=np.float64)
    assert f.spatial_strides == (10, 1)
    assert f.index_strides == ()
    assert f.center_vector == sp.Matrix([f.center])
@@ -37,20 +49,21 @@ def test_field_basic():
    assert f1.ndim == f.ndim
    assert f1.values_per_cell() == f.values_per_cell()

-    f = Field.create_fixed_size('f', (8, 8, 2, 2), index_dimensions=2)
-    assert f.center_vector == sp.Matrix([[f(0, 0), f(0, 1)],
-                                         [f(1, 0), f(1, 1)]])
+    f = Field.create_fixed_size("f", (8, 8, 2, 2), index_dimensions=2)
+    assert f.center_vector == sp.Matrix([[f(0, 0), f(0, 1)], [f(1, 0), f(1, 1)]])
    field_access = f[1, 1]
    assert field_access.nr_of_coordinates == 2
-    assert field_access.offset_name == 'NE'
+    assert field_access.offset_name == "NE"
    neighbor = field_access.neighbor(coord_id=0, offset=-2)
    assert neighbor.offsets == (-1, 1)
-    assert '_' in neighbor._latex('dummy')
+    assert "_" in neighbor._latex("dummy")

-    f = Field.create_fixed_size('f', (8, 8, 2, 2, 2), index_dimensions=3)
-    assert f.center_vector == sp.Array([[[f(i, j, k) for k in range(2)] for j in range(2)] for i in range(2)])
+    f = Field.create_fixed_size("f", (8, 8, 2, 2, 2), index_dimensions=3)
+    assert f.center_vector == sp.Array(
+        [[[f(i, j, k) for k in range(2)] for j in range(2)] for i in range(2)]
+    )

-    f = Field.create_generic('f', spatial_dimensions=5, index_dimensions=2)
+    f = Field.create_generic("f", spatial_dimensions=5, index_dimensions=2)
    field_access = f[1, -1, 2, -3, 0](1, 0)
    assert field_access.offsets == (1, -1, 2, -3, 0)
    assert field_access.index == (1, 0)
@@ -60,61 +73,71 @@ def test_error_handling():
    struct_dtype = np.dtype([('a', np.int32), ('b', np.float64), ('c', np.uint32)])
    Field.create_generic('f', spatial_dimensions=2, index_dimensions=0, dtype=struct_dtype)
    with pytest.raises(ValueError) as e:
-        Field.create_generic('f', spatial_dimensions=2, index_dimensions=1, dtype=struct_dtype)
-    assert 'index dimension' in str(e.value)
+        Field.create_generic(
+            "f", spatial_dimensions=2, index_dimensions=1, dtype=struct_dtype
+        )
+    assert "index dimension" in str(e.value)

-    arr = np.array([[[(1,)*3, (2,)*3, (3,)*3]]*2], dtype=struct_dtype)
-    Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    arr = np.array([[[(1,) * 3, (2,) * 3, (3,) * 3]] * 2], dtype=struct_dtype)
+    Field.create_from_numpy_array("f", arr, index_dimensions=0)
    with pytest.raises(ValueError) as e:
-        Field.create_from_numpy_array('f', arr, index_dimensions=1)
-    assert 'Structured arrays' in str(e.value)
+        Field.create_from_numpy_array("f", arr, index_dimensions=1)
+    assert "Structured arrays" in str(e.value)

    arr = np.zeros([3, 3, 3])
-    Field.create_from_numpy_array('f', arr, index_dimensions=2)
+    Field.create_from_numpy_array("f", arr, index_dimensions=2)
    with pytest.raises(ValueError) as e:
-        Field.create_from_numpy_array('f', arr, index_dimensions=3)
-    assert 'Too many' in str(e.value)
+        Field.create_from_numpy_array("f", arr, index_dimensions=3)
+    assert "Too many" in str(e.value)

-    Field.create_fixed_size('f', (3, 2, 4), index_dimensions=0, dtype=struct_dtype, layout='reverse_numpy')
+    Field.create_fixed_size(
+        "f", (3, 2, 4), index_dimensions=0, dtype=struct_dtype, layout="reverse_numpy"
+    )
    with pytest.raises(ValueError) as e:
-        Field.create_fixed_size('f', (3, 2, 4), index_dimensions=1, dtype=struct_dtype, layout='reverse_numpy')
-    assert 'Structured arrays' in str(e.value)
-
-    f = Field.create_fixed_size('f', (10, 10))
+        Field.create_fixed_size(
+            "f",
+            (3, 2, 4),
+            index_dimensions=1,
+            dtype=struct_dtype,
+            layout="reverse_numpy",
+        )
+    assert "Structured arrays" in str(e.value)
+
+    f = Field.create_fixed_size("f", (10, 10))
    with pytest.raises(ValueError) as e:
        f[1]
-    assert 'Wrong number of spatial indices' in str(e.value)
+    assert "Wrong number of spatial indices" in str(e.value)

-    f = Field.create_generic('f', spatial_dimensions=2, index_shape=(3,))
+    f = Field.create_generic("f", spatial_dimensions=2, index_shape=(3,))
    with pytest.raises(ValueError) as e:
        f(3)
-    assert 'out of bounds' in str(e.value)
+    assert "out of bounds" in str(e.value)

-    f = Field.create_fixed_size('f', (10, 10, 3, 4), index_dimensions=2)
+    f = Field.create_fixed_size("f", (10, 10, 3, 4), index_dimensions=2)
    with pytest.raises(ValueError) as e:
        f(3, 0)
-    assert 'out of bounds' in str(e.value)
+    assert "out of bounds" in str(e.value)

    with pytest.raises(ValueError) as e:
        f(1, 0)(1, 0)
-    assert 'Indexing an already indexed' in str(e.value)
+    assert "Indexing an already indexed" in str(e.value)

    with pytest.raises(ValueError) as e:
        f(1)
-    assert 'Wrong number of indices' in str(e.value)
+    assert "Wrong number of indices" in str(e.value)

    with pytest.raises(ValueError) as e:
-        Field.create_generic('f', spatial_dimensions=2, layout='wrong')
-    assert 'Unknown layout descriptor' in str(e.value)
+        Field.create_generic("f", spatial_dimensions=2, layout="wrong")
+    assert "Unknown layout descriptor" in str(e.value)

-    assert layout_string_to_tuple('fzyx', dim=4) == (3, 2, 1, 0)
+    assert layout_string_to_tuple("fzyx", dim=4) == (3, 2, 1, 0)
    with pytest.raises(ValueError) as e:
-        layout_string_to_tuple('wrong', dim=4)
-    assert 'Unknown layout descriptor' in str(e.value)
+        layout_string_to_tuple("wrong", dim=4)
+    assert "Unknown layout descriptor" in str(e.value)


 def test_decorator_scoping():
-    dst = ps.fields('dst : double[2D]')
+    dst = ps.fields("dst : double[2D]")

    def f1():
        a = sp.Symbol("a")
@@ -134,7 +157,7 @@ def test_decorator_scoping():


 def test_string_creation():
-    x, y, z = ps.fields('  x(4),    y(3,5) z : double[  3,  47]')
+    x, y, z = ps.fields("  x(4),    y(3,5) z : double[  3,  47]")
    assert x.index_shape == (4,)
    assert y.index_shape == (3, 5)
    assert z.spatial_shape == (3, 47)
@@ -142,19 +165,85 @@ def test_string_creation():

 def test_itemsize():

-    x = ps.fields('x: float32[1d]')
-    y = ps.fields('y:  float64[2d]')
-    i = ps.fields('i:  int16[1d]')
+    x = ps.fields("x: float32[1d]")
+    y = ps.fields("y:  float64[2d]")
+    i = ps.fields("i:  int16[1d]")

    assert x.itemsize == 4
    assert y.itemsize == 8
    assert i.itemsize == 2


+def test_spatial_memory_layout_descriptors():
+    assert (
+        spatial_layout_string_to_tuple("AoS", 3)
+        == spatial_layout_string_to_tuple("aos", 3)
+        == spatial_layout_string_to_tuple("ZYXF", 3)
+        == spatial_layout_string_to_tuple("zyxf", 3)
+        == (2, 1, 0)
+    )
+    assert (
+        spatial_layout_string_to_tuple("SoA", 3)
+        == spatial_layout_string_to_tuple("soa", 3)
+        == spatial_layout_string_to_tuple("FZYX", 3)
+        == spatial_layout_string_to_tuple("fzyx", 3)
+        == spatial_layout_string_to_tuple("f", 3)
+        == spatial_layout_string_to_tuple("F", 3)
+        == (2, 1, 0)
+    )
+    assert (
+        spatial_layout_string_to_tuple("c", 3)
+        == spatial_layout_string_to_tuple("C", 3)
+        == (0, 1, 2)
+    )
+
+    assert spatial_layout_string_to_tuple("C", 5) == (0, 1, 2, 3, 4)
+
+    with pytest.raises(ValueError):
+        spatial_layout_string_to_tuple("aos", -1)
+
+    with pytest.raises(ValueError):
+        spatial_layout_string_to_tuple("aos", 4)
+
+
+def test_memory_layout_descriptors():
+    assert (
+        layout_string_to_tuple("AoS", 4)
+        == layout_string_to_tuple("aos", 4)
+        == layout_string_to_tuple("ZYXF", 4)
+        == layout_string_to_tuple("zyxf", 4)
+        == (2, 1, 0, 3)
+    )
+    assert (
+        layout_string_to_tuple("SoA", 4)
+        == layout_string_to_tuple("soa", 4)
+        == layout_string_to_tuple("FZYX", 4)
+        == layout_string_to_tuple("fzyx", 4)
+        == layout_string_to_tuple("f", 4)
+        == layout_string_to_tuple("F", 4)
+        == (3, 2, 1, 0)
+    )
+    assert (
+        layout_string_to_tuple("c", 4)
+        == layout_string_to_tuple("C", 4)
+        == (0, 1, 2, 3)
+    )
+
+    assert layout_string_to_tuple("C", 5) == (0, 1, 2, 3, 4)
+
+    with pytest.raises(ValueError):
+        layout_string_to_tuple("aos", -1)
+
+    with pytest.raises(ValueError):
+        layout_string_to_tuple("aos", 5)
+
+
 def test_staggered():

    # D2Q5
-    j1, j2, j3 = ps.fields('j1(2), j2(2,2), j3(2,2,2) : double[2D]', field_type=FieldType.STAGGERED)
+    j1, j2, j3 = ps.fields(
+        "j1(2), j2(2,2), j3(2,2,2) : double[2D]", field_type=FieldType.STAGGERED
+    )

    assert j1[0, 1](1) == j1.staggered_access((0, sp.Rational(1, 2)))
    assert j1[0, 1](1) == j1.staggered_access(np.array((0, sp.Rational(1, 2))))
@@ -163,7 +252,7 @@ def test_staggered():
    assert j1[0, 1](1) == j1.staggered_access("N")
    assert j1[0, 0](1) == j1.staggered_access("S")
    assert j1.staggered_vector_access("N") == sp.Matrix([j1.staggered_access("N")])
-    assert j1.staggered_stencil_name == 'D2Q5'
+    assert j1.staggered_stencil_name == "D2Q5"

    assert j1.physical_coordinates[0] == TypedSymbol("ctr_0", create_type("int"), nonnegative=True)
    assert j1.physical_coordinates[1] == TypedSymbol("ctr_1", create_type("int"), nonnegative=True)
@@ -176,28 +265,40 @@ def test_staggered():

    assert j2[0, 1](1, 1) == j2.staggered_access((0, sp.Rational(1, 2)), 1)
    assert j2[0, 1](1, 1) == j2.staggered_access("N", 1)
-    assert j2.staggered_vector_access("N") == sp.Matrix([j2.staggered_access("N", 0), j2.staggered_access("N", 1)])
+    assert j2.staggered_vector_access("N") == sp.Matrix(
+        [j2.staggered_access("N", 0), j2.staggered_access("N", 1)]
+    )

    assert j3[0, 1](1, 1, 1) == j3.staggered_access((0, sp.Rational(1, 2)), (1, 1))
    assert j3[0, 1](1, 1, 1) == j3.staggered_access("N", (1, 1))
-    assert j3.staggered_vector_access("N") == sp.Matrix([[j3.staggered_access("N", (i, j))
-                                                        for j in range(2)] for i in range(2)])
+    assert j3.staggered_vector_access("N") == sp.Matrix(
+        [[j3.staggered_access("N", (i, j)) for j in range(2)] for i in range(2)]
+    )

    # D2Q9
-    k1, k2 = ps.fields('k1(4), k2(2) : double[2D]', field_type=FieldType.STAGGERED)
+    k1, k2 = ps.fields("k1(4), k2(2) : double[2D]", field_type=FieldType.STAGGERED)

    assert k1[1, 1](2) == k1.staggered_access("NE")
    assert k1[0, 0](2) == k1.staggered_access("SW")
    assert k1[0, 0](3) == k1.staggered_access("NW")
-    
+
    a = k1.staggered_access("NE")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(1, 2), sp.Rational(1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(1, 2),
+        sp.Rational(1, 2),
+    ]
    a = k1.staggered_access("SW")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(-1, 2), sp.Rational(-1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(-1, 2),
+        sp.Rational(-1, 2),
+    ]
    a = k1.staggered_access("NW")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(-1, 2), sp.Rational(1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(-1, 2),
+        sp.Rational(1, 2),
+    ]

    # sign reversed when using as flux field
-    r = ps.fields('r(2) : double[2D]', field_type=FieldType.STAGGERED_FLUX)
+    r = ps.fields("r(2) : double[2D]", field_type=FieldType.STAGGERED_FLUX)
    assert r[0, 0](0) == r.staggered_access("W")
    assert -r[1, 0](0) == r.staggered_access("E")
--- a/tests/test_gpu.py
+++ b/tests/test_gpu.py
 import pytest

 import numpy as np
-import cupy as cp
 import sympy as sp
+import math
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target, get_code_str
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice

 try:
-    import cupy
-    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+    import cupy as cp
+    device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
    device_numbers = []
+    cp = None


 def test_averaging_kernel():
+    pytest.importorskip('cupy')
    size = (40, 55)
    src_arr = np.random.rand(*size)
    src_arr = add_ghost_layers(src_arr)
@@ -44,6 +46,7 @@ def test_averaging_kernel():


 def test_variable_sized_fields():
+    pytest.importorskip('cupy')
    src_field = Field.create_generic('src', spatial_dimensions=2)
    dst_field = Field.create_generic('dst', spatial_dimensions=2)

@@ -71,6 +74,7 @@ def test_variable_sized_fields():


 def test_multiple_index_dimensions():
+    pytest.importorskip('cupy')
    """Sums along the last axis of a numpy array"""
    src_size = (7, 6, 4)
    dst_size = src_size[:2]
@@ -103,6 +107,7 @@ def test_multiple_index_dimensions():


 def test_ghost_layer():
+    pytest.importorskip('cupy')
    size = (6, 5)
    src_arr = np.ones(size)
    dst_arr = np.zeros_like(src_arr)
@@ -127,6 +132,7 @@ def test_ghost_layer():


 def test_setting_value():
+    pytest.importorskip('cupy')
    arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
    arr_gpu = cp.asarray(arr_cpu)

@@ -143,6 +149,7 @@ def test_setting_value():


 def test_periodicity():
+    pytest.importorskip('cupy')
    from pystencils.gpu.periodicity import get_periodic_boundary_functor as periodic_gpu
    from pystencils.slicing import get_periodic_boundary_functor as periodic_cpu

@@ -163,6 +170,7 @@ def test_periodicity():

 @pytest.mark.parametrize("device_number", device_numbers)
 def test_block_indexing(device_number):
+    pytest.importorskip('cupy')
    f = fields("f: [3D]")
    s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
@@ -195,6 +203,7 @@ def test_block_indexing(device_number):
 @pytest.mark.parametrize('layout', ("C", "F"))
 @pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    pytest.importorskip('cupy')
    n_elements = np.prod(shape)

    arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
@@ -210,3 +219,39 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):

    kernel(f=arr_gpu, value=np.float64(42.0))
    np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)
+
+
+@pytest.mark.parametrize('start', (1, 5))
+@pytest.mark.parametrize('end', (-1, -2, -3, -4))
+@pytest.mark.parametrize('step', (1, 2, 3, 4))
+@pytest.mark.parametrize('shape', ([55, 60], [77, 101, 80], [44, 64, 66]))
+def test_guards_with_iteration_slices(start, end, step, shape):
+    iter_slice = tuple([slice(start, end, step)] * len(shape))
+
+    kernel_config_gpu = CreateKernelConfig(target=Target.GPU, iteration_slice=iter_slice)
+    field_1 = fields(f"f(1) : double{list(shape)}")
+    assignment = Assignment(field_1.center, 1)
+    ast = create_kernel(assignment, config=kernel_config_gpu)
+    code_str = get_code_str(ast)
+
+    test_strings = list()
+    iteration_ranges = list()
+    for i, s in enumerate(iter_slice):
+        e = ((shape[i] + end) - s.start) / s.step
+        e = math.ceil(e) + s.start
+        test_strings.append(f"{s.start} < {e}")
+
+        a = s.start
+        counter = 0
+        while a < e:
+            a += 1
+            counter += 1
+        iteration_ranges.append(counter)
+
+    # check if the expected if statement is in the GPU code
+    for s in test_strings:
+        assert s in code_str
+
+    # check if these bounds lead to same lengths as the range function would produce
+    for i in range(len(iter_slice)):
+        assert iteration_ranges[i] == len(range(iter_slice[i].start, shape[i] + end, iter_slice[i].step))
No results found