diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index 72323f1cbceb72405b3ed88bf97411a231e4c47d..f4df9d3b27d7c0827a2442de92937e8d255aa610 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -23,7 +23,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): '*': 'mul[0, 1]', '/': 'div[0, 1]', 'sqrt': 'sqrt[0]', - 'rsqrt': 'rsqrt[0]', + 'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang 'loadU': 'xl[0x0, 0]', 'loadA': 'ld[0x0, 0]', @@ -73,6 +73,12 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result[intrinsic_id] = 'vec_' + name + arg_string + if data_type == 'double': + # Clang and XL C++ are missing these for doubles + result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}') + result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}') + result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}') + result['+int'] = "vec_add({0}, {1})" result['width'] = width @@ -82,10 +88,12 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}' result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"'] - result['makeVecConst'] = '((' + result[data_type] + '){{' + ", ".join(['{0}' for _ in range(width)]) + '}})' - result['makeVec'] = '((' + result[data_type] + '){{' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + '}})' - result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['{0}' for _ in range(intwidth)]) + '}})' - result['makeVecInt'] = '((' + result['int'] + '){{{0}, {1}, {2}, {3}}})' + result['makeVecConst'] = '((' + result[data_type] + '){{' + \ + ", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})' + result['makeVec'] = '((' + result[data_type] + '){{' + \ + ", 
".join(['(' + data_type + ') {' + str(i) + '}' for i in range(width)]) + '}})' + result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})' + result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})' result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index b3418eb3083f804338d804b1cb0fa709852ddbea..850f8ff6d4a9ae168a78c6588a69fd87f5e5f03e 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -14,19 +14,28 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'): return get_vector_instruction_set_x86(data_type, instruction_set) +_cache = None + + def get_supported_instruction_sets(): """List of supported instruction sets on current hardware, or None if query failed.""" + global _cache + if _cache is not None: + return _cache.copy() if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo return ['neon'] elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo import subprocess + import tempfile from pystencils.cpu.cpujit import get_compiler_config - command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-'] + f = tempfile.NamedTemporaryFile(suffix='.cpp') + command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name] macros = subprocess.check_output(command, input='', text=True) if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: - return ['vsx'] + _cache = ['vsx'] else: - return [] + _cache = [] + return _cache.copy() try: from cpuinfo import get_cpu_info except ImportError: diff --git a/pystencils/include/philox_rand.h 
b/pystencils/include/philox_rand.h index 2cc953c4a1621db3123ab6881d40ba96075c9ef0..1b8b9d9f6ab5cb694deb7508014a5c0d080fbb6b 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -296,29 +296,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct #ifdef __ALTIVEC__ -QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key) +QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key) { #ifdef __POWER10_VECTOR__ - __vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); - __vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); #else - __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1)); - __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1)); #ifdef __LITTLE_ENDIAN__ - __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b); - __vector uint32 lo1 = 
vec_mergee(lohi1a, lohi1b); - __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b); - __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b); + __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b); + __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b); + __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b); + __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b); #else - __vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b); - __vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b); - __vector uint32 hi0 = vec_mergee(lohi0a, lohi0b); - __vector uint32 hi1 = vec_mergee(lohi1a, lohi1b); + __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b); + __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b); + __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b); + __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b); #endif #endif @@ -328,7 +328,7 @@ QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key) ctr[3] = lo0; } -QUALIFIERS void _philox4x32bumpkey(__vector uint32* key) +QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key) { key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0)); key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1)); @@ -336,7 +336,7 @@ QUALIFIERS void _philox4x32bumpkey(__vector uint32* key) #ifdef __VSX__ template<bool high> -QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y) +QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y) { // convert 32 to 64 bit #ifdef __LITTLE_ENDIAN__ @@ -364,16 +364,14 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 #endif // calculate z = x ^ y << (53 - 32)) - __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL)); - z = vec_xor((__vector uint64) x, z); + __vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL)); + z = vec_xor((__vector unsigned long long) x, z); // convert uint64 to double -#if defined(__has_builtin) && 
__has_builtin(__builtin_convertvector) - __vector double rs = __builtin_convertvector(z, __vector double); -#elif defined(__GNUC__) && __GNUC__ >= 8 - __vector double rs = vec_ctf(z, 0); -#else +#ifdef __ibmxl__ __vector double rs = vec_ctd(z, 0); +#else + __vector double rs = vec_ctf(z, 0); #endif // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0)); @@ -383,12 +381,12 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 #endif -QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3, +QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3, uint32 key0, uint32 key1, __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) { - __vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)}; - __vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)}; + __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3}; _philox4x32round(ctr, key); // 1 _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 @@ -414,12 +412,12 @@ QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vect #ifdef __VSX__ -QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3, +QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3, uint32 key0, uint32 key1, __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi) { - __vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)}; - __vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + __vector unsigned int 
key[2] = {vec_splats(key0), vec_splats(key1)}; + __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3}; _philox4x32round(ctr, key); // 1 _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 @@ -438,13 +436,13 @@ QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vec } #endif -QUALIFIERS void philox_float4(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3, +QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) { - __vector uint32 ctr0v = vec_splats(ctr0); - __vector uint32 ctr2v = vec_splats(ctr2); - __vector uint32 ctr3v = vec_splats(ctr3); + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); } @@ -453,28 +451,28 @@ QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint3 uint32 key0, uint32 key1, __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) { - philox_float4(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); + philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); } #ifdef __VSX__ -QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3, +QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi) { - __vector uint32 ctr0v = vec_splats(ctr0); - __vector uint32 ctr2v = vec_splats(ctr2); - __vector uint32 ctr3v = vec_splats(ctr3); + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector 
unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi); } -QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3, +QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, __vector double & rnd1, __vector double & rnd2) { - __vector uint32 ctr0v = vec_splats(ctr0); - __vector uint32 ctr2v = vec_splats(ctr2); - __vector uint32 ctr3v = vec_splats(ctr3); + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); __vector double ignore; philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); @@ -484,7 +482,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint uint32 key0, uint32 key1, __vector double & rnd1, __vector double & rnd2) { - philox_double2(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2); + philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2); } #endif #endif diff --git a/pytest.ini b/pytest.ini index e7b0eeb98f38cb30734a302aae3aad742e4b643e..db4823c05dd55d97262a02e25ad8652bf3f17d01 100644 --- a/pytest.ini +++ b/pytest.ini @@ -41,7 +41,7 @@ exclude_lines = if __name__ == .__main__.: skip_covered = True -fail_under = 89 +fail_under = 88 [html] directory = coverage_report