diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 5a3703c7c0efa07021b5047e5d7b9dd09acad797..33713c5b8396f86edae4d1be9150709bcf227f2f 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -85,5 +85,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q result['&'] = f'vand{q_reg}_u{bits[data_type]}' + '({0}, {1})' result['|'] = f'vorr{q_reg}_u{bits[data_type]}' + '({0}, {1})' result['blendv'] = f'vbsl{q_reg}_f{bits[data_type]}' + '({2}, {1}, {0})' + result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' + result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' return result diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 9603d6d232bd677b1e6ca0cb7ec147773b688fdb..2a15ef74f9ee5a528538d5976cd4a40d55426c82 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -588,18 +588,17 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): return self.instruction_set['rsqrt'].format(self._print(expr.args[0])) else: return f"({self._print(1 / sp.sqrt(expr.args[0]))})" - elif isinstance(expr, vec_any): - expr_type = get_type_of_expression(expr.args[0]) - if type(expr_type) is not VectorType: - return self._print(expr.args[0]) - else: - return self.instruction_set['any'].format(self._print(expr.args[0])) - elif isinstance(expr, vec_all): + elif isinstance(expr, vec_any) or isinstance(expr, vec_all): + instr = 'any' if isinstance(expr, vec_any) else 'all' expr_type = get_type_of_expression(expr.args[0]) if type(expr_type) is not VectorType: return self._print(expr.args[0]) else: - return self.instruction_set['all'].format(self._print(expr.args[0])) + if isinstance(expr.args[0], sp.Rel): + op = expr.args[0].rel_op + if (instr, op) in self.instruction_set: + return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args]) + return self.instruction_set[instr].format(self._print(expr.args[0])) return super(VectorizedCustomSympyPrinter, self)._print_Function(expr) diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index e3421e589e555f639c01274fd7bd99991b671247..72323f1cbceb72405b3ed88bf97411a231e4c47d 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -41,6 +41,19 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): '&': 'and[0, 1]', '|': 'or[0, 1]', 'blendv': 'sel[0, 1, 2]', + + ('any', '=='): 'any_eq[0, 1]', + ('any', '!='): 'any_ne[0, 1]', + ('any', '<='): 'any_le[0, 1]', + ('any', '<'): 'any_lt[0, 1]', + ('any', '>='): 'any_ge[0, 1]', + ('any', '>'): 'any_gt[0, 1]', + ('all', '=='): 'all_eq[0, 1]', + ('all', '!='): 'all_ne[0, 1]', + ('all', '<='): 'all_le[0, 1]', + ('all', '<'): 'all_lt[0, 1]', + ('all', '>='): 'all_ge[0, 1]', + ('all', '>'): 'all_gt[0, 1]', } bits = {'double': 64, @@ -74,4 +87,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['{0}' for _ in range(intwidth)]) + '}})' result['makeVecInt'] = '((' + result['int'] + '){{{0}, {1}, {2}, {3}}})' + result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' + result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' + return result diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py index 57164d6789619903ddf133f2dc78848f8fccb112..836ffc57906b503c03164715fbed6e19aabfe760 100644 --- a/pystencils/backends/x86_instruction_sets.py +++ b/pystencils/backends/x86_instruction_sets.py @@ -137,11 +137,11 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result['double'] = f"__m{bit_width}d" result['float'] = f"__m{bit_width}" result['int'] = f"__m{bit_width}i" - result['bool'] = f"__m{bit_width}d" + result['bool'] = result[data_type] result['headers'] = headers[instruction_set] result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0" - result['all'] = f"{pre}_movemask_{suf}({{0}}) == 0xF" + result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}" if instruction_set == 'avx512': size = 8 if data_type == 'double' else 16 diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py index dcad1e1c716a117c2489a8e6b91d6abf9c54c084..59f6367cd6385ea57a596ddcf1f1cf27b4bcf85c 100644 --- a/pystencils_tests/test_conditional_vec.py +++ b/pystencils_tests/test_conditional_vec.py @@ -4,18 +4,19 @@ import pytest import pystencils as ps from pystencils.astnodes import Block, Conditional -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets +from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.cpu.vectorization import vec_all, vec_any +supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else [] -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') -@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions') -@pytest.mark.xfail('vsx' in get_supported_instruction_sets(), reason='PPC collective instructions not implemented') -def test_vec_any(): - data_arr = np.zeros((15, 15)) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float', 'double')) +def test_vec_any(instruction_set, dtype): + width = get_vector_instruction_set(dtype, instruction_set)['width'] + data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32) - data_arr[3:9, 1] = 1.0 - data = ps.fields("data: double[2D]", data=data_arr) + data_arr[3:9, 1:3*width-1] = 1.0 + data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)), @@ -23,25 +24,21 @@ def test_vec_any(): ps.Assignment(data.center(), 2.0) ])) ] - instruction_set = get_supported_instruction_sets()[-1] ast = ps.create_kernel(c, target='cpu', cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) + np.testing.assert_equal(data_arr[3:9, :3*width], 2.0) - width = ast.instruction_set['width'] - np.testing.assert_equal(data_arr[3:9, 0:width], 2.0) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float', 'double')) +def test_vec_all(instruction_set, dtype): + width = get_vector_instruction_set(dtype, instruction_set)['width'] + data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32) - -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') -@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions') -@pytest.mark.xfail('vsx' in get_supported_instruction_sets(), reason='PPC collective instructions not implemented') -def test_vec_all(): - data_arr = np.zeros((15, 15)) - - data_arr[3:9, 2:7] = 1.0 - data = ps.fields("data: double[2D]", data=data_arr) + data_arr[3:9, 1:3*width-1] = 1.0 + data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ Conditional(vec_all(data.center() > 0.0), Block([ @@ -49,14 +46,17 @@ def test_vec_all(): ])) ] ast = ps.create_kernel(c, target='cpu', - cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]}) + cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() - before = data_arr.copy() kernel(data=data_arr) - np.testing.assert_equal(data_arr, before) + np.testing.assert_equal(data_arr[3:9, :1], 0.0) + np.testing.assert_equal(data_arr[3:9, 1:width], 1.0) + np.testing.assert_equal(data_arr[3:9, width:2*width], 2.0) + np.testing.assert_equal(data_arr[3:9, 2*width:3*width-1], 1.0) + np.testing.assert_equal(data_arr[3:9, 3*width-1:], 0.0) -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') +@pytest.mark.skipif(not supported_instruction_sets, reason='cannot detect CPU instruction set') def test_boolean_before_loop(): t1, t2 = sp.symbols('t1, t2') f_arr = np.ones((10, 10)) @@ -68,7 +68,7 @@ def test_boolean_before_loop(): ps.Assignment(g[0, 0], sp.Piecewise((f[0, 0], t1), (42, True))) ] - ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]}) + ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': supported_instruction_sets[-1]}) kernel = ast.compile() kernel(f=f_arr, g=g_arr, t2=1.0) print(g)