From e77df63f672ea28cc389c1fadd695338f2985c7d Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Sat, 10 Aug 2024 23:27:47 +0200 Subject: [PATCH] Add SVE nontemporal stores and scatters, including masked variants The added tests revealed a number of bugs in maskStore/maskStoreS of other instructions sets, which are also fixed. --- .gitlab-ci.yml | 2 +- .../backends/arm_instruction_sets.py | 36 +++++++---- src/pystencils/backends/cbackend.py | 38 +++++++++--- .../backends/riscv_instruction_sets.py | 4 +- .../backends/simd_instruction_sets.py | 10 +++- tests/test_conditional_vec.py | 59 ++++++++++++++++--- tests/test_random.py | 2 +- tests/test_vectorization.py | 6 +- tests/test_vectorization_specific.py | 15 +++-- 9 files changed, 130 insertions(+), 42 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1778561a7..fbb45987b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -195,7 +195,7 @@ arm64v9: image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 before_script: - *multiarch_before_script - - sed -i s/march=native/march=armv8-a+sve+sme/g ~/.config/pystencils/config.json + - sed -i s/march=native/march=armv9-a+sve2+sme/g ~/.config/pystencils/config.json - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json riscv64: diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py index 3e50d5f45..227224f4e 100644 --- a/src/pystencils/backends/arm_instruction_sets.py +++ b/src/pystencils/backends/arm_instruction_sets.py @@ -18,8 +18,11 @@ def get_argument_string(function_shortcut, first=''): def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'): raise NotImplementedError(instruction_set) - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: cmp = 'cmp' + elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + cmp = 'cmp' + bitwidth = int(instruction_set[4:]) elif instruction_set.startswith('sve'): cmp = 'cmp' bitwidth = int(instruction_set[3:]) @@ -52,7 +55,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result = dict() - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: width = 'svcntd()' if data_type == 'double' else 'svcntw()' intwidth = 'svcntw()' result['bytes'] = 'svcntb()' @@ -61,13 +64,14 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): intwidth = bitwidth // bits['int'] result['bytes'] = bitwidth // 8 if instruction_set.startswith('sve') or instruction_set == 'sme': + base_names['stream'] = 'stnt1[0, 1]' prefix = 'sv' suffix = f'_f{bits[data_type]}' elif instruction_set == 'neon': prefix = 'v' suffix = f'q_f{bits[data_type]}' - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})' int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})' else: @@ -86,7 +90,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result[intrinsic_id] = prefix + name + suffix + undef + arg_string - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: from pystencils.backends.cbackend import CFunction result['width'] = CFunction(width, "int") result['intwidth'] = CFunction(intwidth, "int") @@ -105,15 +109,18 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): vindex.format("{2}") + ', {1})' result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ vindex.format("{1}") + ')' + if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \ + vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})' result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})" - result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sme"] else ""}t' + result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' - result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"'] + result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"'] result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})' result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})' @@ -122,12 +129,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}' result['maskStoreU'] = result['storeU'].replace(predicate, '{2}') + result['maskStream'] = result['stream'].replace(predicate, '{2}') if instruction_set != 'sme': result['maskStoreS'] = result['storeS'].replace(predicate, '{3}') + if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + result['maskStreamS'] = result['streamS'].replace(predicate, '{3}') + + result['streamFence'] = '__dmb(15)' if instruction_set == 'sme': result['function_prefix'] = '__attribute__((arm_locally_streaming))' - elif instruction_set not in ['sve', 'sme']: + elif instruction_set not in ['sve', 'sve2', 'sme']: result['compile_flags'] = [f'-msve-vector-bits={bitwidth}'] else: result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})' @@ -152,7 +164,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' + # SVE has real nontemporal stores, so we only need to zero cachlines on Neon + result['cachelineZero'] = 'cachelineZero((void*) {0})' + result['cachelineSize'] = 'cachelineSize()' - result['cachelineZero'] = 'cachelineZero((void*) {0})' return result diff --git a/src/pystencils/backends/cbackend.py b/src/pystencils/backends/cbackend.py index 7dbf84d37..657f60d2f 100644 --- a/src/pystencils/backends/cbackend.py +++ b/src/pystencils/backends/cbackend.py @@ -280,14 +280,25 @@ class CBackend: if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc): arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args instr = 'storeU' - if aligned: + if nontemporal and 'storeA' not in self._vector_instruction_set and \ + 'stream' in self._vector_instruction_set: + instr = 'stream' + elif aligned: instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' if mask != True: # NOQA - instr = 'maskStoreA' if aligned else 'maskStoreU' + instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \ + 'maskStoreA' if aligned else 'maskStoreU' if instr not in self._vector_instruction_set: - self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format( + if instr == 'maskStream' and 'stream' in self._vector_instruction_set: + store, load = 'stream', 'loadA' + elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set: + store, load = 'storeA', 'loadA' + else: + store, load = 'storeU', 'loadU' + load = load if load in self._vector_instruction_set else 'loadU' + self._vector_instruction_set[instr] = self._vector_instruction_set[store].format( '{0}', self._vector_instruction_set['blendv'].format( - self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs), + self._vector_instruction_set[load].format('{0}', **self._kwargs), '{1}', '{2}', **self._kwargs), **self._kwargs) printed_mask = self.sympy_printer.doprint(mask) if data_type.base_type.c_name == 'double': @@ -312,12 +323,14 @@ class CBackend: ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) if stride != 1: - instr = 'maskStoreS' if mask != True else 'storeS' # NOQA + instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else + 'maskStoreS') if mask != True else \ + ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), stride, printed_mask, **self._kwargs) + ';' pre_code = '' - if nontemporal and 'cachelineZero' in self._vector_instruction_set: + if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True: # NOQA first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0" offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i)) * node.lhs.args[0].field.spatial_strides[i] for i in @@ -337,15 +350,22 @@ class CBackend: code2 = self._vector_instruction_set['flushCacheline'].format( ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';' code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}" - elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: + elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8] rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8] tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}' code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \ + self.sympy_printer.doprint(rhs) + ';' code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';' - code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask, - **self._kwargs) + ';' + maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA' + instr2 = maskStore if mask != True else store # NOQA + if instr2 not in self._vector_instruction_set: + self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format( + '{0}', self._vector_instruction_set['blendv'].format( + self._vector_instruction_set[load].format('{0}', **self._kwargs), + '{1}', '{2}', **self._kwargs), + **self._kwargs) + code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';' code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}" return pre_code + code else: diff --git a/src/pystencils/backends/riscv_instruction_sets.py b/src/pystencils/backends/riscv_instruction_sets.py index e456c2b8c..27f631e7f 100644 --- a/src/pystencils/backends/riscv_instruction_sets.py +++ b/src/pystencils/backends/riscv_instruction_sets.py @@ -34,7 +34,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): 'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]', 'loadS': f'lse{bits[data_type]}_v[0, 1]', 'storeS': f'sse{bits[data_type]}_v[0, 2, 1]', - 'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]', + 'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]', 'abs': 'fabs_v[0]', '==': 'mfeq_vv[0, 1]', @@ -89,7 +89,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}') result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}') - result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}') + result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}') result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})" diff --git a/src/pystencils/backends/simd_instruction_sets.py b/src/pystencils/backends/simd_instruction_sets.py index b94d9f374..ac6a626c3 100644 --- a/src/pystencils/backends/simd_instruction_sets.py +++ b/src/pystencils/backends/simd_instruction_sets.py @@ -45,15 +45,19 @@ def get_supported_instruction_sets(): result = ['neon'] # Neon is mandatory on 64-bit ARM libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP + hwcap2 = libc.getauxval(26) # AT_HWCAP2 if hwcap & (1 << 22): # HWCAP_SVE + if hwcap2 & (1 << 1): # HWCAP2_SVE2 + name = 'sve2' + else: + name = 'sve' length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL if length < 0: raise OSError("SVE length query failed") while length >= 128: - result.append(f"sve{length}") + result.append(f"{name}{length}") length //= 2 - result.append("sve") - hwcap2 = libc.getauxval(26) # AT_HWCAP2 + result.append(name) if hwcap2 & (1 << 23): # HWCAP2_SME result.append("sme") return result diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py index 032c8ab78..015dd08f7 100644 --- a/tests/test_conditional_vec.py +++ b/tests/test_conditional_vec.py @@ -3,6 +3,7 @@ import sympy as sp import pytest import pystencils as ps +from pystencils.alignedarray import aligned_zeros from pystencils.astnodes import Block, Conditional, SympyAssignment from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.enums import Target @@ -15,7 +16,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_any(instruction_set, dtype): - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: width = 4 # we don't know the actual value else: width = get_vector_instruction_set(dtype, instruction_set)['width'] @@ -34,7 +35,7 @@ def test_vec_any(instruction_set, dtype): cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: # we only know that the first value has changed np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0) else: @@ -44,7 +45,7 @@ def test_vec_any(instruction_set, dtype): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_all(instruction_set, dtype): - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: width = 1000 # we don't know the actual value, need something guaranteed larger than vector else: width = get_vector_instruction_set(dtype, instruction_set)['width'] @@ -59,7 +60,7 @@ def test_vec_all(instruction_set, dtype): cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: # we only know that some values in the middle have been replaced assert np.all(data_arr[3:9, :2] <= 1.0) assert np.any(data_arr[3:9, 2:] == 2.0) @@ -94,16 +95,60 @@ def test_boolean_before_loop(): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) -def test_vec_maskstore(instruction_set, dtype): - data_arr = np.zeros((16, 16), dtype=dtype) +@pytest.mark.parametrize('nontemporal', [False, True]) +@pytest.mark.parametrize('aligned', [False, True]) +def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned): + data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype) data_arr[3:-3, 3:-3] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] assignmets = NodeCollection(c) - config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype) + config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal, + 'assume_aligned': aligned}, + default_number_float=dtype) ast = ps.create_kernel(assignmets, config=config) + if 'maskStore' in ast.instruction_set: + instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else ( + 'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore') + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) + print(ps.get_code_str(ast)) + kernel = ast.compile() + kernel(data=data_arr) + np.testing.assert_equal(data_arr[:3, :], 2.0) + np.testing.assert_equal(data_arr[-3:, :], 2.0) + np.testing.assert_equal(data_arr[:, :3], 2.0) + np.testing.assert_equal(data_arr[:, -3:], 2.0) + np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0) + + +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('nontemporal', [False, True]) +def test_vec_maskscatter(instruction_set, dtype, nontemporal): + data_arr = np.zeros((16, 16), dtype=dtype) + data_arr[3:-3, 3:-3] = 1.0 + data = ps.fields(f"data: {dtype}[2D]") + + c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] + + assignmets = NodeCollection(c) + config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal}, + default_number_float=dtype) + if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \ + and not instruction_set.startswith('sve'): + with pytest.warns(UserWarning) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert 'Could not vectorize loop' in warn[0].message.args[0] + else: + with pytest.warns(None) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert len(warn) == 0 + instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS' + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) print(ps.get_code_str(ast)) kernel = ast.compile() kernel(data=data_arr) diff --git a/tests/test_random.py b/tests/test_random.py index 21933e893..49e9583ae 100644 --- a/tests/test_random.py +++ b/tests/test_random.py @@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows': def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): if target == Target.GPU: pytest.importorskip('cupy') - if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni': + if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni': pytest.xfail('AES not yet implemented for this architecture') if rng == 'aesni' and len(keys) == 2: keys *= 2 diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py index d2350526e..729da033b 100644 --- a/tests/test_vectorization.py +++ b/tests/test_vectorization.py @@ -143,10 +143,10 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set): # Without the base pointer spec, the inner store is not aligned config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp) ast = ps.create_kernel(update_rule, config=config) - if instruction_set in ['sse'] or instruction_set.startswith('avx'): + if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'): assert 'stream' in ast.instruction_set assert 'streamFence' in ast.instruction_set - if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'): + if instruction_set in ['neon', 'vsx', 'rvv']: assert 'cachelineZero' in ast.instruction_set if instruction_set in ['vsx']: assert 'storeAAndFlushCacheline' in ast.instruction_set @@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set): def test_hardware_query(): - assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets) + assert {'sse', 'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets) def test_vectorised_pow(instruction_set=instruction_set): diff --git a/tests/test_vectorization_specific.py b/tests/test_vectorization_specific.py index dcebeae60..19c6e0033 100644 --- a/tests/test_vectorization_specific.py +++ b/tests/test_vectorization_specific.py @@ -60,22 +60,27 @@ def test_vectorized_abs(instruction_set, dtype): @pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) -def test_strided(instruction_set, dtype): +@pytest.mark.parametrize('nontemporal', [False, True]) +def test_strided(instruction_set, dtype, nontemporal): f, g = ps.fields(f"f, g : {dtype}[2D]") update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)] + config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal}, + default_number_float=dtype) if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \ and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'): with pytest.warns(UserWarning) as warn: - config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert 'Could not vectorize loop' in warn[0].message.args[0] else: with pytest.warns(None) as warn: - config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert len(warn) == 0 + instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS' + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) + instruction = 'cachelineZero' + if instruction in ast.instruction_set: + assert ast.instruction_set[instruction] not in ps.get_code_str(ast) # ps.show_code(ast) func = ast.compile() -- GitLab