From e77df63f672ea28cc389c1fadd695338f2985c7d Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Sat, 10 Aug 2024 23:27:47 +0200
Subject: [PATCH] Add SVE nontemporal stores and scatters, including masked
 variants

The added tests revealed a number of bugs in maskStore/maskStoreS of other instructions sets, which are also fixed.
---
 .gitlab-ci.yml                                |  2 +-
 .../backends/arm_instruction_sets.py          | 36 +++++++----
 src/pystencils/backends/cbackend.py           | 38 +++++++++---
 .../backends/riscv_instruction_sets.py        |  4 +-
 .../backends/simd_instruction_sets.py         | 10 +++-
 tests/test_conditional_vec.py                 | 59 ++++++++++++++++---
 tests/test_random.py                          |  2 +-
 tests/test_vectorization.py                   |  6 +-
 tests/test_vectorization_specific.py          | 15 +++--
 9 files changed, 130 insertions(+), 42 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1778561a7..fbb45987b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -195,7 +195,7 @@ arm64v9:
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   before_script:
     - *multiarch_before_script
-    - sed -i s/march=native/march=armv8-a+sve+sme/g ~/.config/pystencils/config.json
+    - sed -i s/march=native/march=armv9-a+sve2+sme/g ~/.config/pystencils/config.json
     - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json
 
 riscv64:
diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py
index 3e50d5f45..227224f4e 100644
--- a/src/pystencils/backends/arm_instruction_sets.py
+++ b/src/pystencils/backends/arm_instruction_sets.py
@@ -18,8 +18,11 @@ def get_argument_string(function_shortcut, first=''):
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
     if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
         raise NotImplementedError(instruction_set)
-    if instruction_set in ['sve', 'sme']:
+    if instruction_set in ['sve', 'sve2', 'sme']:
         cmp = 'cmp'
+    elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+        cmp = 'cmp'
+        bitwidth = int(instruction_set[4:])
     elif instruction_set.startswith('sve'):
         cmp = 'cmp'
         bitwidth = int(instruction_set[3:])
@@ -52,7 +55,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
 
     result = dict()
 
-    if instruction_set in ['sve', 'sme']:
+    if instruction_set in ['sve', 'sve2', 'sme']:
         width = 'svcntd()' if data_type == 'double' else 'svcntw()'
         intwidth = 'svcntw()'
         result['bytes'] = 'svcntb()'
@@ -61,13 +64,14 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         intwidth = bitwidth // bits['int']
         result['bytes'] = bitwidth // 8
     if instruction_set.startswith('sve') or instruction_set == 'sme':
+        base_names['stream'] = 'stnt1[0, 1]'
         prefix = 'sv'
         suffix = f'_f{bits[data_type]}' 
     elif instruction_set == 'neon':
         prefix = 'v'
         suffix = f'q_f{bits[data_type]}' 
 
-    if instruction_set in ['sve', 'sme']:
+    if instruction_set in ['sve', 'sve2', 'sme']:
         predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
         int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
     else:
@@ -86,7 +90,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
 
         result[intrinsic_id] = prefix + name + suffix + undef + arg_string
 
-    if instruction_set in ['sve', 'sme']:
+    if instruction_set in ['sve', 'sve2', 'sme']:
         from pystencils.backends.cbackend import CFunction
         result['width'] = CFunction(width, "int")
         result['intwidth'] = CFunction(intwidth, "int")
@@ -105,15 +109,18 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
                                vindex.format("{2}") + ', {1})'
             result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
                               vindex.format("{1}") + ')'
+        if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                                vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
 
         result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
 
-        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
-        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
-        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
-        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
+        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
 
-        result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']
+        result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
 
         result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
         result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
@@ -122,12 +129,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
 
         result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
+        result['maskStream'] = result['stream'].replace(predicate, '{2}')
         if instruction_set != 'sme':
             result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+            if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+                result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
+        
+        result['streamFence'] = '__dmb(15)'
 
         if instruction_set == 'sme':
             result['function_prefix'] = '__attribute__((arm_locally_streaming))'
-        elif instruction_set not in ['sve', 'sme']:
+        elif instruction_set not in ['sve', 'sve2', 'sme']:
             result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
     else:
         result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
@@ -152,7 +164,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
         result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
 
+        # SVE has real nontemporal stores, so we only need to zero cachlines on Neon
+        result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
     result['cachelineSize'] = 'cachelineSize()'
-    result['cachelineZero'] = 'cachelineZero((void*) {0})'
 
     return result
diff --git a/src/pystencils/backends/cbackend.py b/src/pystencils/backends/cbackend.py
index 7dbf84d37..657f60d2f 100644
--- a/src/pystencils/backends/cbackend.py
+++ b/src/pystencils/backends/cbackend.py
@@ -280,14 +280,25 @@ class CBackend:
             if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
                 arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
                 instr = 'storeU'
-                if aligned:
+                if nontemporal and 'storeA' not in self._vector_instruction_set and \
+                        'stream' in self._vector_instruction_set:
+                    instr = 'stream'
+                elif aligned:
                     instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                 if mask != True:  # NOQA
-                    instr = 'maskStoreA' if aligned else 'maskStoreU'
+                    instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
+                            'maskStoreA' if aligned else 'maskStoreU'
                     if instr not in self._vector_instruction_set:
-                        self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
+                        if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
+                            store, load = 'stream', 'loadA'
+                        elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
+                            store, load = 'storeA', 'loadA'
+                        else:
+                            store, load = 'storeU', 'loadU'
+                        load = load if load in self._vector_instruction_set else 'loadU'
+                        self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
                             '{0}', self._vector_instruction_set['blendv'].format(
-                                self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs),
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
                                 '{1}', '{2}', **self._kwargs), **self._kwargs)
                     printed_mask = self.sympy_printer.doprint(mask)
                     if data_type.base_type.c_name == 'double':
@@ -312,12 +323,14 @@ class CBackend:
                 ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
 
                 if stride != 1:
-                    instr = 'maskStoreS' if mask != True else 'storeS'  # NOQA
+                    instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
+                             'maskStoreS') if mask != True else \
+                            ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS')  # NOQA
                     return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                       stride, printed_mask, **self._kwargs) + ';'
 
                 pre_code = ''
-                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True:  # NOQA
                     first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
                     offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
                                       * node.lhs.args[0].field.spatial_strides[i] for i in
@@ -337,15 +350,22 @@ class CBackend:
                     code2 = self._vector_instruction_set['flushCacheline'].format(
                         ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                     code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
-                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
                     lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
                     rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
                     tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
                     code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                         + self.sympy_printer.doprint(rhs) + ';'
                     code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
-                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask,
-                                                                                           **self._kwargs) + ';'
+                    maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
+                    instr2 = maskStore if mask != True else store  # NOQA
+                    if instr2 not in self._vector_instruction_set:
+                        self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs),
+                            **self._kwargs)
+                    code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                     code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
                 return pre_code + code
             else:
diff --git a/src/pystencils/backends/riscv_instruction_sets.py b/src/pystencils/backends/riscv_instruction_sets.py
index e456c2b8c..27f631e7f 100644
--- a/src/pystencils/backends/riscv_instruction_sets.py
+++ b/src/pystencils/backends/riscv_instruction_sets.py
@@ -34,7 +34,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
         'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
         'loadS': f'lse{bits[data_type]}_v[0, 1]',
         'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
-        'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
 
         'abs': 'fabs_v[0]',
         '==': 'mfeq_vv[0, 1]',
@@ -89,7 +89,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
 
     result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
     result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
-    result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
 
     result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
 
diff --git a/src/pystencils/backends/simd_instruction_sets.py b/src/pystencils/backends/simd_instruction_sets.py
index b94d9f374..ac6a626c3 100644
--- a/src/pystencils/backends/simd_instruction_sets.py
+++ b/src/pystencils/backends/simd_instruction_sets.py
@@ -45,15 +45,19 @@ def get_supported_instruction_sets():
         result = ['neon']  # Neon is mandatory on 64-bit ARM
         libc = CDLL('libc.so.6')
         hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
         if hwcap & (1 << 22):  # HWCAP_SVE
+            if hwcap2 & (1 << 1):  # HWCAP2_SVE2
+                name = 'sve2'
+            else:
+                name = 'sve'
             length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
             if length < 0:
                 raise OSError("SVE length query failed")
             while length >= 128:
-                result.append(f"sve{length}")
+                result.append(f"{name}{length}")
                 length //= 2
-            result.append("sve")
-        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+            result.append(name)
         if hwcap2 & (1 << 23):  # HWCAP2_SME
             result.append("sme")
         return result
diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py
index 032c8ab78..015dd08f7 100644
--- a/tests/test_conditional_vec.py
+++ b/tests/test_conditional_vec.py
@@ -3,6 +3,7 @@ import sympy as sp
 import pytest
 
 import pystencils as ps
+from pystencils.alignedarray import aligned_zeros
 from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
@@ -15,7 +16,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 4  # we don't know the actual value
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -34,7 +35,7 @@ def test_vec_any(instruction_set, dtype):
                            cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that the first value has changed
         np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
     else:
@@ -44,7 +45,7 @@ def test_vec_any(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 1000  # we don't know the actual value, need something guaranteed larger than vector
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -59,7 +60,7 @@ def test_vec_all(instruction_set, dtype):
                            cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that some values in the middle have been replaced
         assert np.all(data_arr[3:9, :2] <= 1.0)
         assert np.any(data_arr[3:9, 2:] == 2.0)
@@ -94,16 +95,60 @@ def test_boolean_before_loop():
 
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=dtype)
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
     data_arr[3:-3, 3:-3] = 1.0
     data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
 
     c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
 
     assignmets = NodeCollection(c)
-    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
     ast = ps.create_kernel(assignmets, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+                      'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+
+    assignmets = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        with pytest.warns(None) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert len(warn) == 0
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
     print(ps.get_code_str(ast))
     kernel = ast.compile()
     kernel(data=data_arr)
diff --git a/tests/test_random.py b/tests/test_random.py
index 21933e893..49e9583ae 100644
--- a/tests/test_random.py
+++ b/tests/test_random.py
@@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
     if target == Target.GPU:
         pytest.importorskip('cupy')
-    if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
+    if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
         pytest.xfail('AES not yet implemented for this architecture')
     if rng == 'aesni' and len(keys) == 2:
         keys *= 2
diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py
index d2350526e..729da033b 100644
--- a/tests/test_vectorization.py
+++ b/tests/test_vectorization.py
@@ -143,10 +143,10 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
     # Without the base pointer spec, the inner store is not aligned
     config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
     ast = ps.create_kernel(update_rule, config=config)
-    if instruction_set in ['sse'] or instruction_set.startswith('avx'):
+    if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'):
         assert 'stream' in ast.instruction_set
         assert 'streamFence' in ast.instruction_set
-    if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'vsx', 'rvv']:
         assert 'cachelineZero' in ast.instruction_set
     if instruction_set in ['vsx']:
         assert 'storeAAndFlushCacheline' in ast.instruction_set
@@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set):
 
 
 def test_hardware_query():
-    assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
+    assert {'sse', 'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
 
 
 def test_vectorised_pow(instruction_set=instruction_set):
diff --git a/tests/test_vectorization_specific.py b/tests/test_vectorization_specific.py
index dcebeae60..19c6e0033 100644
--- a/tests/test_vectorization_specific.py
+++ b/tests/test_vectorization_specific.py
@@ -60,22 +60,27 @@ def test_vectorized_abs(instruction_set, dtype):
 
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-def test_strided(instruction_set, dtype):
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_strided(instruction_set, dtype, nontemporal):
     f, g = ps.fields(f"f, g : {dtype}[2D]")
     update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
+    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                                      'nontemporal': nontemporal},
+                                                  default_number_float=dtype)
     if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \
             and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'):
         with pytest.warns(UserWarning) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert 'Could not vectorize loop' in warn[0].message.args[0]
     else:
         with pytest.warns(None) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert len(warn) == 0
+        instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    instruction = 'cachelineZero'
+    if instruction in ast.instruction_set:
+        assert ast.instruction_set[instruction] not in ps.get_code_str(ast)
 
     # ps.show_code(ast)
     func = ast.compile()
-- 
GitLab