diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py index 5e650c035b3bbdc9e6e707eb857aab7403985424..227224f4e65460a291bd3a6cd3309ed3525072fa 100644 --- a/src/pystencils/backends/arm_instruction_sets.py +++ b/src/pystencils/backends/arm_instruction_sets.py @@ -110,8 +110,8 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ vindex.format("{1}") + ')' if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'): - result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ - vindex.format("{2}") + ', {1})' + result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \ + vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})' result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})" diff --git a/src/pystencils/backends/cbackend.py b/src/pystencils/backends/cbackend.py index 8f4d061786f09b9d6d479944944803d178abae11..a3853198b600db5c66dbb1da97d6807782fe3373 100644 --- a/src/pystencils/backends/cbackend.py +++ b/src/pystencils/backends/cbackend.py @@ -280,15 +280,22 @@ class CBackend: if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc): arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args instr = 'storeU' - if aligned: + if nontemporal and 'storeA' not in self._vector_instruction_set and \ + 'stream' in self._vector_instruction_set: + instr = 'stream' + elif aligned: instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' if mask != True: # NOQA instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \ 'maskStoreA' if aligned else 'maskStoreU' if instr not in self._vector_instruction_set: - self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format( + store = 'store' + instr[-1] + store = store if store in self._vector_instruction_set else 'storeU' + load = 'load' + instr[-1] + load = load if load in self._vector_instruction_set else 'loadU' + self._vector_instruction_set[instr] = self._vector_instruction_set[store].format( '{0}', self._vector_instruction_set['blendv'].format( - self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs), + self._vector_instruction_set[load].format('{0}', **self._kwargs), '{1}', '{2}', **self._kwargs), **self._kwargs) printed_mask = self.sympy_printer.doprint(mask) if data_type.base_type.c_name == 'double': @@ -314,12 +321,13 @@ class CBackend: if stride != 1: instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else - 'maskStoreS') if mask != True else ('streamS' if nontemporal else 'storeS') # NOQA + 'maskStoreS') if mask != True else \ + ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), stride, printed_mask, **self._kwargs) + ';' pre_code = '' - if nontemporal and 'cachelineZero' in self._vector_instruction_set: + if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True: # NOQA first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0" offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i)) * node.lhs.args[0].field.spatial_strides[i] for i in @@ -339,15 +347,20 @@ class CBackend: code2 = self._vector_instruction_set['flushCacheline'].format( ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';' code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}" - elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: + elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8] rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8] tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}' code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \ + self.sympy_printer.doprint(rhs) + ';' code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';' - code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask, - **self._kwargs) + ';' + instr2 = 'maskStoreAAndFlushCacheline' if mask != True else 'storeAAndFlushCacheline' # NOQA + if instr2 not in self._vector_instruction_set: + self._vector_instruction_set[instr2] = self._vector_instruction_set['storeAAndFlushCacheline'] \ + .format('{0}', self._vector_instruction_set['blendv'].format( + self._vector_instruction_set['loadA'].format('{0}', **self._kwargs), + '{1}', '{2}', **self._kwargs), **self._kwargs) + code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';' code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}" return pre_code + code else: diff --git a/src/pystencils/backends/riscv_instruction_sets.py b/src/pystencils/backends/riscv_instruction_sets.py index e456c2b8c05facda6f422d5061832331979c889f..27f631e7f92d25e366bc767c759697ac898f3308 100644 --- a/src/pystencils/backends/riscv_instruction_sets.py +++ b/src/pystencils/backends/riscv_instruction_sets.py @@ -34,7 +34,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): 'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]', 'loadS': f'lse{bits[data_type]}_v[0, 1]', 'storeS': f'sse{bits[data_type]}_v[0, 2, 1]', - 'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]', + 'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]', 'abs': 'fabs_v[0]', '==': 'mfeq_vv[0, 1]', @@ -89,7 +89,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}') result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}') - result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}') + result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}') result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})" diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py index 84d67f0cfe939b2a640889ec23551fd86c857ee3..0118178d51ca5ff5ebbd136a7ac29c3e6ca399c0 100644 --- a/tests/test_conditional_vec.py +++ b/tests/test_conditional_vec.py @@ -3,6 +3,7 @@ import sympy as sp import pytest import pystencils as ps +from pystencils.alignedarray import aligned_zeros from pystencils.astnodes import Block, Conditional, SympyAssignment from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.enums import Target @@ -94,16 +95,60 @@ def test_boolean_before_loop(): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) -def test_vec_maskstore(instruction_set, dtype): - data_arr = np.zeros((16, 16), dtype=dtype) +@pytest.mark.parametrize('nontemporal', [False, True]) +@pytest.mark.parametrize('aligned', [False, True]) +def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned): + data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype) data_arr[3:-3, 3:-3] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] assignmets = NodeCollection(c) - config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype) + config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal, + 'assume_aligned': aligned}, + default_number_float=dtype) ast = ps.create_kernel(assignmets, config=config) + if 'maskStore' in ast.instruction_set: + instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else ( + 'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore') + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) + print(ps.get_code_str(ast)) + kernel = ast.compile() + kernel(data=data_arr) + np.testing.assert_equal(data_arr[:3, :], 2.0) + np.testing.assert_equal(data_arr[-3:, :], 2.0) + np.testing.assert_equal(data_arr[:, :3], 2.0) + np.testing.assert_equal(data_arr[:, -3:], 2.0) + np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0) + + +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('nontemporal', [False, True]) +def test_vec_maskscatter(instruction_set, dtype, nontemporal): + data_arr = np.zeros((16, 16), dtype=dtype) + data_arr[3:-3, 3:-3] = 1.0 + data = ps.fields(f"data: {dtype}[2D]") + + c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] + + assignmets = NodeCollection(c) + config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set, + 'nontemporal': nontemporal}, + default_number_float=dtype) + if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \ + and not instruction_set.startswith('sve'): + with pytest.warns(UserWarning) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert 'Could not vectorize loop' in warn[0].message.args[0] + else: + with pytest.warns(None) as warn: + ast = ps.create_kernel(assignmets, config=config) + assert len(warn) == 0 + instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS' + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) print(ps.get_code_str(ast)) kernel = ast.compile() kernel(data=data_arr) diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py index 7d487062eceac9a48071a355b1b90d8cb95113b5..729da033b9c6bca97af5466310fb2aff8dd5b43f 100644 --- a/tests/test_vectorization.py +++ b/tests/test_vectorization.py @@ -143,7 +143,7 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set): # Without the base pointer spec, the inner store is not aligned config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp) ast = ps.create_kernel(update_rule, config=config) - if instruction_set in ['sse'] or instruction_set.startswith('avx'): + if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'): assert 'stream' in ast.instruction_set assert 'streamFence' in ast.instruction_set if instruction_set in ['neon', 'vsx', 'rvv']: diff --git a/tests/test_vectorization_specific.py b/tests/test_vectorization_specific.py index 8a49fa7cdf68f9456a55d3610cb9585001e90ed5..84fec99dd549e2e18411e08bb7f8a88d7561b3a9 100644 --- a/tests/test_vectorization_specific.py +++ b/tests/test_vectorization_specific.py @@ -94,6 +94,11 @@ def test_strided(instruction_set, dtype, nontemporal): with pytest.warns(None) as warn: ast = ps.create_kernel(update_rule, config=config) assert len(warn) == 0 + instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS' + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) + instruction = 'cachelineZero' + if instruction in ast.instruction_set: + assert ast.instruction_set[instruction] not in ps.get_code_str(ast) # ps.show_code(ast) func = ast.compile()