Skip to content
Snippets Groups Projects

Add SVE nontemporal stores and scatters, including masked variants

Merged Michael Kuron requested to merge sve into master
Files
3
@@ -280,9 +280,11 @@ class CBackend:
if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
instr = 'storeU'
if aligned:
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else \
'storeA' if 'storeA' in self._vector_instruction_set else 'storeU'
if nontemporal and 'storeA' not in self._vector_instruction_set and \
'stream' in self._vector_instruction_set:
instr = 'stream'
elif aligned:
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
if mask != True: # NOQA
instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
'maskStoreA' if aligned else 'maskStoreU'
@@ -319,12 +321,14 @@ class CBackend:
if stride != 1:
instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
'maskStoreS') if mask != True else ('streamS' if nontemporal else 'storeS') # NOQA
'maskStoreS') if mask != True else \
('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA
return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
stride, printed_mask, **self._kwargs) + ';'
pre_code = ''
if nontemporal and 'cachelineZero' in self._vector_instruction_set:
blended = self._vector_instruction_set['blendv'].split('{')[0] in self._vector_instruction_set[instr]
if nontemporal and 'cachelineZero' in self._vector_instruction_set and not blended:
first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
* node.lhs.args[0].field.spatial_strides[i] for i in
@@ -701,8 +705,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
arg, data_type, aligned, _, mask, stride = expr.args
if stride != 1:
return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
instruction = self.instruction_set['loadA'] if aligned and 'loadA' in self.instruction_set \
else self.instruction_set['loadU']
instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
return instruction.format(f"& {self._print(arg)}", **self._kwargs)
elif expr.func == DivFunc:
result = self._scalarFallback('_print_Function', expr)