diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 26f61e909ee115c5b9e081877a232fdb082d2b7e..5a3703c7c0efa07021b5047e5d7b9dd09acad797 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -14,6 +14,9 @@ def get_argument_string(function_shortcut): def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True): + if instruction_set != 'neon': + raise NotImplementedError(instruction_set) + base_names = { '+': 'add[0, 1]', '-': 'sub[0, 1]', diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index 377715456bcba8e09882bae86a7f51b9807a5839..e3421e589e555f639c01274fd7bd99991b671247 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -14,6 +14,9 @@ def get_argument_string(function_shortcut): def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): + if instruction_set != 'vsx': + raise NotImplementedError(instruction_set) + base_names = { '+': 'add[0, 1]', '-': 'sub[0, 1]', diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 4e0f55fb55f860ff8b1224645989cc1249fa0148..b3418eb3083f804338d804b1cb0fa709852ddbea 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -21,7 +21,7 @@ def get_supported_instruction_sets(): elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo import subprocess from pystencils.cpu.cpujit import get_compiler_config - command = [get_compiler_config()['command'], '-dM', '-E', '-'] + command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-'] macros = subprocess.check_output(command, input='', text=True) if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: return ['vsx'] diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index a38c3e165bd8a2ec244d0ac175977a4f9a475c4b..cda6fd0cff2537d3e3cf345eefbdfec250b8696d 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -44,14 +44,16 @@ typedef std::uint64_t uint64; QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) { +#ifndef __CUDA_ARCH__ + // host code #ifdef __powerpc__ *hip = __mulhwu(a,b); return a*b; -#elif !defined(__CUDA_ARCH__) - // host code +#else uint64 product = ((uint64)a) * ((uint64)b); *hip = product >> 32; return (uint32)product; +#endif #else // device code *hip = __umulhi(a,b); @@ -296,15 +298,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct #ifdef __ALTIVEC__ QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key) { +#ifdef __POWER10_VECTOR__ + __vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); +#else __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1)); +#ifdef __LITTLE_ENDIAN__ __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b); __vector uint32 lo1 = vec_mergee(lohi1a, lohi1b); __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b); __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b); +#else + __vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b); + __vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b); + __vector uint32 hi0 = vec_mergee(lohi0a, lohi0b); + __vector uint32 hi1 = vec_mergee(lohi1a, lohi1b); +#endif +#endif ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]); ctr[1] = lo1; @@ -323,6 +339,7 @@ template<bool high> QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y) { // convert 32 to 64 bit +#ifdef __LITTLE_ENDIAN__ if (high) { x = vec_mergel(x, vec_splats(0U)); @@ -333,13 +350,31 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 x = vec_mergeh(x, vec_splats(0U)); y = vec_mergeh(y, vec_splats(0U)); } +#else + if (high) + { + x = vec_mergel(vec_splats(0U), x); + y = vec_mergel(vec_splats(0U), y); + } + else + { + x = vec_mergeh(vec_splats(0U), x); + y = vec_mergeh(vec_splats(0U), y); + } +#endif // calculate z = x ^ y << (53 - 32)) __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL)); z = vec_xor((__vector uint64) x, z); // convert uint64 to double - __vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available +#if defined(__has_builtin) && __has_builtin(__builtin_convertvector) + __vector double rs = __builtin_convertvector(z, __vector double); +#elif defined(__GNUC__) && __GNUC__ >= 8 + __vector double rs = vec_ctf(z, 0); +#else + __vector double rs = vec_ctd(z, 0); +#endif // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));