Skip to content
Snippets Groups Projects
Commit b1750b81 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

add fence after non-temporal stores

Fixes #25
parent b075b723
No related branches found
No related tags found
1 merge request: !230 "Improve non-temporal stores"
Pipeline #31226 passed
......@@ -324,7 +324,7 @@ class Block(Node):
node.parent = self
self._nodes.insert(0, node)
def insert_before(self, new_node, insert_before):
def insert_before(self, new_node, insert_before, if_not_exists=False):
new_node.parent = self
assert self._nodes.count(insert_before) == 1
idx = self._nodes.index(insert_before)
......@@ -337,7 +337,25 @@ class Block(Node):
idx -= 1
else:
break
self._nodes.insert(idx, new_node)
if not if_not_exists or self._nodes[idx] != new_node:
self._nodes.insert(idx, new_node)
def insert_after(self, new_node, insert_after, if_not_exists=False):
    """Insert ``new_node`` into this block after the node ``insert_after``.

    Args:
        new_node: node to insert; its ``parent`` is set to this block
            (also when the insertion ends up being skipped).
        insert_after: reference node; must occur exactly once in this block.
        if_not_exists: when true, do nothing if a node comparing equal to
            ``new_node`` already sits directly before or directly at the
            computed insertion position.

    Raises:
        AssertionError: if ``insert_after`` does not occur exactly once.
    """
    new_node.parent = self
    assert self._nodes.count(insert_after) == 1
    idx = self._nodes.index(insert_after) + 1

    # move all assignment (definitions to the top)
    if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
        while idx > 0 and isinstance(self._nodes[idx - 1], (LoopOverCoordinate, Conditional)):
            idx -= 1

    if if_not_exists:
        # The hoisting loop above can leave idx == 0. Without the guard,
        # self._nodes[idx - 1] would wrap around to the *last* node and could
        # wrongly suppress the insertion.
        equals_previous = idx > 0 and self._nodes[idx - 1] == new_node
        equals_next = idx < len(self._nodes) and self._nodes[idx] == new_node
        if equals_previous or equals_next:
            return

    self._nodes.insert(idx, new_node)
def append(self, node):
if isinstance(node, list) or isinstance(node, tuple):
......@@ -816,3 +834,23 @@ class ConditionalFieldAccess(sp.Function):
def __getnewargs__(self):
return self.access, self.outofbounds_condition, self.outofbounds_value
class NontemporalFence(Node):
    """AST node without symbols or arguments; all instances compare equal.

    Judging by its name it marks the place where a fence for non-temporal
    (streaming) stores belongs; how it is rendered is up to the code printer.
    """

    def __init__(self):
        super().__init__(parent=None)

    @property
    def symbols_defined(self):
        """Empty set: this node defines no symbols."""
        return set()

    @property
    def undefined_symbols(self):
        """Empty set: this node reads no symbols."""
        return set()

    @property
    def args(self):
        """Empty list: this node has no child nodes."""
        return []

    def __eq__(self, other):
        # Fences carry no state, so any two fences are interchangeable.
        return isinstance(other, NontemporalFence)

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Since all fences compare equal, they must share one hash.
        return hash(NontemporalFence)
......@@ -277,6 +277,12 @@ class CBackend:
else:
return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
def _print_NontemporalFence(self, _):
if 'stream_fence' in self._vector_instruction_set:
return self._vector_instruction_set['stream_fence'] + ';'
else:
return ''
def _print_TemporaryMemoryAllocation(self, node):
align = 64
np_dtype = node.symbol.dtype.base_type.numpy_dtype
......
......@@ -164,4 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
result['stream_fence'] = '_mm_mfence()'
return result
......@@ -148,6 +148,11 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
if hasattr(indexed, 'field'):
nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
if nontemporal:
parent = loop_node.parent
while type(parent.parent.parent) is not ast.KernelFunction:
parent = parent.parent
parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
if not successful:
warnings.warn("Could not vectorize loop because of non-consecutive memory access")
continue
......
......@@ -47,6 +47,8 @@ def test_aligned_and_nt_stores():
'assume_inner_stride_one': True}
update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
if 'stream_fence' in ast.instruction_set:
assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast)
kernel = ast.compile()
dh.run_kernel(kernel)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment