From 9be2dd01b8b9453f4e60d6859b079b04d156846c Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Sat, 10 Aug 2024 23:27:47 +0200 Subject: [PATCH] Add SVE nontemporal stores and scatters, including masked variants --- .gitlab-ci.yml | 2 +- .../backends/arm_instruction_sets.py | 36 +++++++++++----- src/pystencils/backends/cbackend.py | 6 ++- .../backends/simd_instruction_sets.py | 10 +++-- tests/test_conditional_vec.py | 41 +++++++++++++++++-- tests/test_random.py | 2 +- tests/test_vectorization.py | 4 +- tests/test_vectorization_specific.py | 10 ++--- 8 files changed, 82 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1778561a..fbb45987 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -195,7 +195,7 @@ arm64v9: image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 before_script: - *multiarch_before_script - - sed -i s/march=native/march=armv8-a+sve+sme/g ~/.config/pystencils/config.json + - sed -i s/march=native/march=armv9-a+sve2+sme/g ~/.config/pystencils/config.json - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json riscv64: diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py index 3e50d5f4..5e650c03 100644 --- a/src/pystencils/backends/arm_instruction_sets.py +++ b/src/pystencils/backends/arm_instruction_sets.py @@ -18,8 +18,11 @@ def get_argument_string(function_shortcut, first=''): def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'): raise NotImplementedError(instruction_set) - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: cmp = 'cmp' + elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + cmp = 'cmp' + bitwidth = int(instruction_set[4:]) elif instruction_set.startswith('sve'): cmp = 'cmp' bitwidth = int(instruction_set[3:]) @@ -52,7 +55,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result = dict() - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: width = 'svcntd()' if data_type == 'double' else 'svcntw()' intwidth = 'svcntw()' result['bytes'] = 'svcntb()' @@ -61,13 +64,14 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): intwidth = bitwidth // bits['int'] result['bytes'] = bitwidth // 8 if instruction_set.startswith('sve') or instruction_set == 'sme': + base_names['stream'] = 'stnt1[0, 1]' prefix = 'sv' suffix = f'_f{bits[data_type]}' elif instruction_set == 'neon': prefix = 'v' suffix = f'q_f{bits[data_type]}' - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})' int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})' else: @@ -86,7 +90,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result[intrinsic_id] = prefix + name + suffix + undef + arg_string - if instruction_set in ['sve', 'sme']: + if instruction_set in ['sve', 'sve2', 'sme']: from pystencils.backends.cbackend import CFunction result['width'] = CFunction(width, "int") result['intwidth'] = CFunction(intwidth, "int") @@ -105,15 +109,18 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): vindex.format("{2}") + ', {1})' result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ vindex.format("{1}") + ')' + if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ + vindex.format("{2}") + ', {1})' result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})" - result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t' - result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sme"] else ""}t' + result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' + result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t' - result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"'] + result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"'] result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})' result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})' @@ -122,12 +129,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}' result['maskStoreU'] = result['storeU'].replace(predicate, '{2}') + result['maskStream'] = result['stream'].replace(predicate, '{2}') if instruction_set != 'sme': result['maskStoreS'] = result['storeS'].replace(predicate, '{3}') + if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): + result['maskStreamS'] = result['streamS'].replace(predicate, '{3}') + + result['streamFence'] = '__dmb(15)' if instruction_set == 'sme': result['function_prefix'] = '__attribute__((arm_locally_streaming))' - elif instruction_set not in ['sve', 'sme']: + elif instruction_set not in ['sve', 'sve2', 'sme']: result['compile_flags'] = [f'-msve-vector-bits={bitwidth}'] else: result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})' @@ -152,7 +164,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' + # SVE has real nontemporal stores, so we only need to zero cachlines on Neon + result['cachelineZero'] = 'cachelineZero((void*) {0})' + result['cachelineSize'] = 'cachelineSize()' - result['cachelineZero'] = 'cachelineZero((void*) {0})' return result diff --git a/src/pystencils/backends/cbackend.py b/src/pystencils/backends/cbackend.py index 7dbf84d3..22c1f796 100644 --- a/src/pystencils/backends/cbackend.py +++ b/src/pystencils/backends/cbackend.py @@ -283,7 +283,8 @@ class CBackend: if aligned: instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' if mask != True: # NOQA - instr = 'maskStoreA' if aligned else 'maskStoreU' + instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \ + 'maskStoreA' if aligned else 'maskStoreU' if instr not in self._vector_instruction_set: self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format( '{0}', self._vector_instruction_set['blendv'].format( @@ -312,7 +313,8 @@ class CBackend: ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) if stride != 1: - instr = 'maskStoreS' if mask != True else 'storeS' # NOQA + instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else + 'maskStoreS') if mask != True else ('streamS' if nontemporal else 'storeS') # NOQA return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), stride, printed_mask, **self._kwargs) + ';' diff --git a/src/pystencils/backends/simd_instruction_sets.py b/src/pystencils/backends/simd_instruction_sets.py index b94d9f37..ac6a626c 100644 --- a/src/pystencils/backends/simd_instruction_sets.py +++ b/src/pystencils/backends/simd_instruction_sets.py @@ -45,15 +45,19 @@ def get_supported_instruction_sets(): result = ['neon'] # Neon is mandatory on 64-bit ARM libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP + hwcap2 = libc.getauxval(26) # AT_HWCAP2 if hwcap & (1 << 22): # HWCAP_SVE + if hwcap2 & (1 << 1): # HWCAP2_SVE2 + name = 'sve2' + else: + name = 'sve' length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL if length < 0: raise OSError("SVE length query failed") while length >= 128: - result.append(f"sve{length}") + result.append(f"{name}{length}") length //= 2 - result.append("sve") - hwcap2 = libc.getauxval(26) # AT_HWCAP2 + result.append(name) if hwcap2 & (1 << 23): # HWCAP2_SME result.append("sme") return result diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py index 032c8ab7..84d67f0c 100644 --- a/tests/test_conditional_vec.py +++ b/tests/test_conditional_vec.py @@ -15,7 +15,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_any(instruction_set, dtype): - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: width = 4 # we don't know the actual value else: width = get_vector_instruction_set(dtype, instruction_set)['width'] @@ -34,7 +34,7 @@ def test_vec_any(instruction_set, dtype): cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: # we only know that the first value has changed np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0) else: @@ -44,7 +44,7 @@ def test_vec_any(instruction_set, dtype): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_all(instruction_set, dtype): - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: width = 1000 # we don't know the actual value, need something guaranteed larger than vector else: width = get_vector_instruction_set(dtype, instruction_set)['width'] @@ -59,7 +59,7 @@ def test_vec_all(instruction_set, dtype): cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) - if instruction_set in ['sve', 'sme', 'rvv']: + if instruction_set in ['sve', 'sve2', 'sme', 'rvv']: # we only know that some values in the middle have been replaced assert np.all(data_arr[3:9, :2] <= 1.0) assert np.any(data_arr[3:9, 2:] == 2.0) @@ -112,3 +112,36 @@ def test_vec_maskstore(instruction_set, dtype): np.testing.assert_equal(data_arr[:, :3], 2.0) np.testing.assert_equal(data_arr[:, -3:], 2.0) np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0) + + +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('nontemporal', [False, True]) +def test_vec_maskscatter(instruction_set, dtype, nontemporal): + data_arr = np.zeros((16, 16), dtype=dtype) + data_arr[3:-3, 3:-3] = 1.0 + data = ps.fields(f"data: {dtype}[2D]") + + c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] + + assignmets = NodeCollection(c) + config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal}, + default_number_float=dtype) + if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \ + and not instruction_set.startswith('sve'): + with pytest.warns(UserWarning) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert 'Could not vectorize loop' in warn[0].message.args[0] + else: + with pytest.warns(None) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert len(warn) == 0 + print(ps.get_code_str(ast)) + kernel = ast.compile() + kernel(data=data_arr) + np.testing.assert_equal(data_arr[:3, :], 2.0) + np.testing.assert_equal(data_arr[-3:, :], 2.0) + np.testing.assert_equal(data_arr[:, :3], 2.0) + np.testing.assert_equal(data_arr[:, -3:], 2.0) + np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0) diff --git a/tests/test_random.py b/tests/test_random.py index 21933e89..49e9583a 100644 --- a/tests/test_random.py +++ b/tests/test_random.py @@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows': def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): if target == Target.GPU: pytest.importorskip('cupy') - if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni': + if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni': pytest.xfail('AES not yet implemented for this architecture') if rng == 'aesni' and len(keys) == 2: keys *= 2 diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py index d2350526..7d487062 100644 --- a/tests/test_vectorization.py +++ b/tests/test_vectorization.py @@ -146,7 +146,7 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set): if instruction_set in ['sse'] or instruction_set.startswith('avx'): assert 'stream' in ast.instruction_set assert 'streamFence' in ast.instruction_set - if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'): + if instruction_set in ['neon', 'vsx', 'rvv']: assert 'cachelineZero' in ast.instruction_set if instruction_set in ['vsx']: assert 'storeAAndFlushCacheline' in ast.instruction_set @@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set): def test_hardware_query(): - assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets) + assert {'sse', 'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets) def test_vectorised_pow(instruction_set=instruction_set): diff --git a/tests/test_vectorization_specific.py b/tests/test_vectorization_specific.py index dcebeae6..2dd31b05 100644 --- a/tests/test_vectorization_specific.py +++ b/tests/test_vectorization_specific.py @@ -60,20 +60,20 @@ def test_vectorized_abs(instruction_set, dtype): @pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) -def test_strided(instruction_set, dtype): +@pytest.mark.parametrize('nontemporal', [False, True]) +def test_strided(instruction_set, dtype, nontemporal): f, g = ps.fields(f"f, g : {dtype}[2D]") update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)] + config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal}, + default_number_float=dtype) if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \ and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'): with pytest.warns(UserWarning) as warn: - config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert 'Could not vectorize loop' in warn[0].message.args[0] else: with pytest.warns(None) as warn: - config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert len(warn) == 0 -- GitLab