Skip to content
Snippets Groups Projects
Commit 9c784763 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

use cache line zero on architectures without nontemporal store

parent b1750b81
Branches
Tags
1 merge request!230Improve non-temporal stores
......@@ -313,7 +313,9 @@ class Block(Node):
self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
return self
def insert_front(self, node):
def insert_front(self, node, if_not_exists=False):
if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
return
if isinstance(node, collections.abc.Iterable):
node = list(node)
for n in node:
......@@ -854,3 +856,25 @@ class NontemporalFence(Node):
def __eq__(self, other):
return isinstance(other, NontemporalFence)
class CachelineSize(Node):
mask_symbol = sp.Symbol("_clsize_mask")
def __init__(self):
super(CachelineSize, self).__init__(parent=None)
@property
def symbols_defined(self):
return set([self.mask_symbol])
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, CachelineSize)
......@@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
......@@ -7,7 +7,7 @@ import sympy as sp
from sympy.core import S
from sympy.logic.boolalg import BooleanFalse, BooleanTrue
from pystencils.astnodes import KernelFunction, Node
from pystencils.astnodes import KernelFunction, Node, CachelineSize
from pystencils.cpu.vectorization import vec_all, vec_any
from pystencils.data_types import (
PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
......@@ -271,15 +271,27 @@ class CBackend:
else:
rhs = node.rhs
return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
self.sympy_printer.doprint(rhs),
ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
pre_code = ''
if instr == 'stream' and 'cachelineZero' in self._vector_instruction_set:
pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "\n\t" + \
self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n'
code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
printed_mask) + ';'
return pre_code + code
else:
return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
def _print_NontemporalFence(self, _):
if 'stream_fence' in self._vector_instruction_set:
return self._vector_instruction_set['stream_fence'] + ';'
if 'streamFence' in self._vector_instruction_set:
return self._vector_instruction_set['streamFence'] + ';'
else:
return ''
def _print_CachelineSize(self, node):
if 'cachelineSize' in self._vector_instruction_set:
return f'const size_t {node.mask_symbol} = {self._vector_instruction_set["cachelineSize"]} - 1;'
else:
return ''
......
......@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]',
'stream': 'stl[1, 0x0, 0]',
'stream': 'st[1, 0x0, 0]', # stl would flush the cacheline, which only makes sense for the last item
'abs': 'abs[0]',
'==': 'cmpeq[0, 1]',
......@@ -98,4 +98,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
......@@ -164,6 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
result['stream_fence'] = '_mm_mfence()'
result['streamFence'] = '_mm_mfence()'
return result
......@@ -149,10 +149,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
if nontemporal:
# insert NontemporalFence after the outermost loop
parent = loop_node.parent
while type(parent.parent.parent) is not ast.KernelFunction:
parent = parent.parent
parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
# insert CachelineSize at the beginning of the kernel
parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True)
if not successful:
warnings.warn("Could not vectorize loop because of non-consecutive memory access")
continue
......
......@@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
inline void cachelineZero(void * p) {
__asm__ volatile("dc zva, %0"::"r"(p));
}
inline size_t _cachelineSize() {
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
#include <altivec.h>
#undef vector
#undef bool
inline void cachelineZero(void * p) {
#ifdef __xlC__
__dcbz(p);
#else
__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
......@@ -33,7 +33,7 @@ def test_vector_type_propagation():
np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
def test_aligned_and_nt_stores():
def test_aligned_and_nt_stores(openmp=False):
domain_size = (24, 24)
# create a datahandling object
dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
......@@ -41,19 +41,29 @@ def test_aligned_and_nt_stores():
# fields
g = dh.add_array("g", values_per_cell=1, alignment=True)
dh.fill("g", 1.0, ghost_layers=True)
f = dh.add_array("f", values_per_cell=1, alignment=True)
if openmp:
# TODO: throw error when not cacheline-aligned
alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
else:
alignment = True
f = dh.add_array("f", values_per_cell=1, alignment=alignment)
dh.fill("f", 0.0, ghost_layers=True)
opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
'assume_inner_stride_one': True}
update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
if 'stream_fence' in ast.instruction_set:
assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast)
ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
if 'streamFence' in ast.instruction_set:
assert ast.instruction_set['streamFence'] in ps.get_code_str(ast)
if 'cachelineZero' in ast.instruction_set:
assert ast.instruction_set['cachelineZero'].split('{0}')[0] in ps.get_code_str(ast)
kernel = ast.compile()
dh.run_kernel(kernel)
np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size))
def test_aligned_and_nt_stores_openmp():
test_aligned_and_nt_stores(True)
def test_inplace_update():
shape = (9, 9, 3)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment