Commit 7f28040a authored by Markus Holzer

Merge branch 'unaligned' into 'master'

Remove loadA/storeA from ISAs where it is the same as loadU/storeU

See merge request pycodegen/pystencils!354
parents bb7a3cf2 7a1a4415
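After this change, the 'loadA', 'storeA' and 'maskStoreA' entries exist in an instruction-set dictionary only when the ISA actually provides dedicated aligned instructions. Code that consumes these dictionaries therefore has to test for the key and fall back to the unaligned variant, which is the pattern the hunks below introduce. A minimal sketch of that lookup, assuming a hypothetical helper select_store and a NEON-like dictionary (neither is pystencils API):

    def select_store(vector_is, assume_aligned):
        # Use the aligned store only if the ISA defines one; otherwise fall back
        # to the unaligned variant, which is always present.
        key = 'storeA' if assume_aligned and 'storeA' in vector_is else 'storeU'
        return vector_is[key]

    # The first hunk below drops 'storeA' from the NEON set because it was
    # identical to 'storeU', so both calls now resolve to the same intrinsic.
    neon_like = {'loadU': 'ld1[0]', 'storeU': 'st1[0, 1]'}
    assert select_store(neon_like, True) == select_store(neon_like, False) == 'st1[0, 1]'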
@@ -35,9 +35,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         'sqrt': 'sqrt[0]',
 
         'loadU': 'ld1[0]',
-        'loadA': 'ld1[0]',
         'storeU': 'st1[0, 1]',
-        'storeA': 'st1[0, 1]',
 
         'abs': 'abs[0]',
         '==': f'{cmp}eq[0, 1]',
@@ -123,7 +121,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
         result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
-        result['maskStoreA'] = result['storeA'].replace(predicate, '{2}')
         result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
 
         if instruction_set != 'sve':
...
@@ -30,11 +30,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
         'sqrt': 'fsqrt_v[0]',
 
         'loadU': f'le{bits[data_type]}_v[0]',
-        'loadA': f'le{bits[data_type]}_v[0]',
         'storeU': f'se{bits[data_type]}_v[0, 1]',
-        'storeA': f'se{bits[data_type]}_v[0, 1]',
         'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
-        'maskStoreA': f'se{bits[data_type]}_v[2, 0, 1]',
         'loadS': f'lse{bits[data_type]}_v[0, 1]',
         'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
         'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
...
@@ -130,7 +130,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
     if nontemporal and 'cachelineZero' in vector_is:
         kernel_ast.use_all_written_field_sizes = True
     strided = 'storeS' in vector_is and 'loadS' in vector_is
-    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
     vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
                                                 strided, keep_loop_stop, assume_sufficient_line_padding,
                                                 default_float_type)
@@ -144,13 +144,10 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem...
     inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
     zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
-    assert ast_node.instruction_set,\
-        "The ast needs to hold information about the instruction_set for the vectorisation"
-    vector_width = ast_node.instruction_set['width']
-    vector_int_width = ast_node.instruction_set['intwidth']
-    load_a = ast_node.instruction_set['loadA']
-    load_u = ast_node.instruction_set['loadU']
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']
 
     for loop_node in inner_loops:
         loop_range = loop_node.stop - loop_node.start
@@ -180,16 +177,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem...
         for indexed in loop_node.atoms(sp.Indexed):
             base, index = indexed.args
             if loop_counter_symbol in index.atoms(sp.Symbol):
-                if not isinstance(vector_width, int) or load_a == load_u:
-                    # When the vector width is not known during code generation, we cannot determine whether
-                    # the access is aligned or not. None of the current sizeless vector ISAs (SVE and RISC-V-V)
-                    # have separate load/store instructions for aligned and unaligned, so there is no disadvantage
-                    # to falling back to unaligned here. When new ISAs become available, this may need to be revisited.
-                    # On sized vector ISAs that do not have separate instructions for aligned and unaligned access,
-                    # alignment does not matter here either
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
                     aligned_access = False
                 else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
                     aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                 loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
                 stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
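The aligned_access check kept in this hunk decides alignment from the offset of the array index relative to the loop counter: with every loop counter substituted by zero, the remaining offset must be a multiple of the vector width. A self-contained sympy sketch of that expression (the symbol ctr and the width of 4 are made-up example values; pystencils derives them from the AST and the instruction set):

    import sympy as sp

    ctr = sp.Symbol('ctr')            # stands in for loop_counter_symbol
    zero_loop_counters = {ctr: 0}
    vector_width = 4                  # e.g. 4 doubles in a 256-bit vector

    for index in (ctr, ctr + 4, ctr + 3):
        offset = (index - ctr).subs(zero_loop_counters)
        aligned = offset % vector_width == 0
        print(index, aligned)         # ctr and ctr + 4 are aligned, ctr + 3 is not

For sizeless ISAs such as SVE or RISC-V V, vector_width is not a plain int at code-generation time, which is why the new code raises NotImplementedError instead of attempting this test; in practice that branch is not reached there, because those ISAs no longer define any aligned variants.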
@@ -238,7 +232,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem...
             substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
             rng._symbols_defined = set(new_result_symbols)
     fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
-    insert_vector_casts(loop_node, ast_node.instruction_set, default_float_type)
+    insert_vector_casts(loop_node, vector_is, default_float_type)
 
 
 def mask_conditionals(loop_body):
...
@@ -119,14 +119,11 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set...
                                    cpu_vectorize_info=opt, ghost_layers=gl_kernel)
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
-    if ast.instruction_set['loadA'] == ast.instruction_set['loadU']:
-        dh.run_kernel(kernel)
-    else:
-        if gl_kernel != gl_field:
-            with pytest.raises(ValueError):
-                dh.run_kernel(kernel)
-        else:
-            dh.run_kernel(kernel)
+    if ('loadA' in ast.instruction_set or 'storeA' in ast.instruction_set) and gl_kernel != gl_field:
+        with pytest.raises(ValueError):
+            dh.run_kernel(kernel)
+    else:
+        dh.run_kernel(kernel)
 
 
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
...