From 7a1a4415986aa94ba48bdc2f030ba2825ee42481 Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Tue, 19 Sep 2023 20:17:40 +0200 Subject: [PATCH] Remove loadA/storeA from ISAs where it is the same as loadU/storeU --- pystencils/backends/arm_instruction_sets.py | 3 -- pystencils/backends/riscv_instruction_sets.py | 3 -- pystencils/cpu/vectorization.py | 28 ++++++++----------- .../test_vectorization_specific.py | 11 +++----- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 9aa8f6c0a..7dede78aa 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -35,9 +35,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): 'sqrt': 'sqrt[0]', 'loadU': 'ld1[0]', - 'loadA': 'ld1[0]', 'storeU': 'st1[0, 1]', - 'storeA': 'st1[0, 1]', 'abs': 'abs[0]', '==': f'{cmp}eq[0, 1]', @@ -123,7 +121,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}' result['maskStoreU'] = result['storeU'].replace(predicate, '{2}') - result['maskStoreA'] = result['storeA'].replace(predicate, '{2}') result['maskStoreS'] = result['storeS'].replace(predicate, '{3}') if instruction_set != 'sve': diff --git a/pystencils/backends/riscv_instruction_sets.py b/pystencils/backends/riscv_instruction_sets.py index 8e0ab7edd..948a645c9 100644 --- a/pystencils/backends/riscv_instruction_sets.py +++ b/pystencils/backends/riscv_instruction_sets.py @@ -30,11 +30,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): 'sqrt': 'fsqrt_v[0]', 'loadU': f'le{bits[data_type]}_v[0]', - 'loadA': f'le{bits[data_type]}_v[0]', 'storeU': f'se{bits[data_type]}_v[0, 1]', - 'storeA': f'se{bits[data_type]}_v[0, 1]', 'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]', - 'maskStoreA': f'se{bits[data_type]}_v[2, 0, 1]', 'loadS': f'lse{bits[data_type]}_v[0, 1]', 'storeS': f'sse{bits[data_type]}_v[0, 2, 1]', 'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]', diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index f39c52d81..0e2b77da6 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -130,7 +130,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', if nontemporal and 'cachelineZero' in vector_is: kernel_ast.use_all_written_field_sizes = True strided = 'storeS' in vector_is and 'loadS' in vector_is - keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU'] + keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU'] vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal, strided, keep_loop_stop, assume_sufficient_line_padding, default_float_type) @@ -144,13 +144,10 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem inner_loops = [loop for loop in all_loops if loop.is_innermost_loop] zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops} - assert ast_node.instruction_set,\ - "The ast needs to hold information about the instruction_set for the vectorisation" - vector_width = ast_node.instruction_set['width'] - vector_int_width = ast_node.instruction_set['intwidth'] - - load_a = ast_node.instruction_set['loadA'] - load_u = ast_node.instruction_set['loadU'] + vector_is = ast_node.instruction_set + assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation" + vector_width = vector_is['width'] + vector_int_width = vector_is['intwidth'] for loop_node in inner_loops: loop_range = loop_node.stop - loop_node.start @@ -180,16 +177,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem for indexed in loop_node.atoms(sp.Indexed): base, index = indexed.args if loop_counter_symbol in index.atoms(sp.Symbol): - if not isinstance(vector_width, int) or load_a == load_u: - # When the vector width is not known during code generation, we cannot determine whether - # the access is aligned or not. None of the current sizeless vector ISAs (SVE and RISC-V-V) - # have separate load/store instructions for aligned and unaligned, so there is no disadvantage - # to falling back to unaligned here. When new ISAs become available, this may need to be revisited. - - # On sized vector ISAs that do not have separate instructions for aligned and unaligned access, - # alignment does not matter here either + if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is: + # don't need to generate the alignment check when there are no aligned load/store instructions aligned_access = False else: + if not isinstance(vector_width, int): + raise NotImplementedError('Access alignment cannot be statically determined for sizeless ' + 'vector ISAs') aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0 loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms() stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index) @@ -238,7 +232,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)}) rng._symbols_defined = set(new_result_symbols) fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase)) - insert_vector_casts(loop_node, ast_node.instruction_set, default_float_type) + insert_vector_casts(loop_node, vector_is, default_float_type) def mask_conditionals(loop_body): diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index db4965755..d1930a07a 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -119,14 +119,11 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set cpu_vectorize_info=opt, ghost_layers=gl_kernel) ast = ps.create_kernel(update_rule, config=config) kernel = ast.compile() - if ast.instruction_set['loadA'] == ast.instruction_set['loadU']: - dh.run_kernel(kernel) - else: - if gl_kernel != gl_field: - with pytest.raises(ValueError): - dh.run_kernel(kernel) - else: + if ('loadA' in ast.instruction_set or 'storeA' in ast.instruction_set) and gl_kernel != gl_field: + with pytest.raises(ValueError): dh.run_kernel(kernel) + else: + dh.run_kernel(kernel) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) -- GitLab