Endianness fix

059e2eef · Michael Kuron · 3f9b4068 · 059e2eef · 059e2eef · 059e2eef
Commit 059e2eef authored 4 years ago by Michael Kuron
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True):
+    if instruction_set != 'neon':
+        raise NotImplementedError(instruction_set)
    base_names = {
        '+': 'add[0, 1]',
        '-': 'sub[0, 1]',

--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
 def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
+    if instruction_set != 'vsx':
+        raise NotImplementedError(instruction_set)
    base_names = {
        '+': 'add[0, 1]',
        '-': 'sub[0, 1]',

--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -21,7 +21,7 @@ def get_supported_instruction_sets():
    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
        import subprocess
        from pystencils.cpu.cpujit import get_compiler_config
-        command = [get_compiler_config()['command'], '-dM', '-E', '-']
+        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-']
        macros = subprocess.check_output(command, input='', text=True)
        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
            return ['vsx']

--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -44,14 +44,16 @@ typedef std::uint64_t uint64;
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
+#ifndef __CUDA_ARCH__
+    // host code
 #ifdef __powerpc__
    *hip = __mulhwu(a,b);
    return a*b;
-#elif !defined(__CUDA_ARCH__)
+#else
-    // host code
    uint64 product = ((uint64)a) * ((uint64)b);
    *hip = product >> 32;
    return (uint32)product;
+#endif
 #else
    // device code
    *hip = __umulhi(a,b);
@@ -296,15 +298,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
 #ifdef __ALTIVEC__
 QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
 {
+#ifdef __POWER10_VECTOR__
+    __vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
+#else
    __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
    __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
    __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
    __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
+#ifdef __LITTLE_ENDIAN__
    __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
    __vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
    __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
    __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
+#else
+    __vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b);
+    __vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b);
+    __vector uint32 hi0 = vec_mergee(lohi0a, lohi0b);
+    __vector uint32 hi1 = vec_mergee(lohi1a, lohi1b);
+#endif
+#endif
    ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
    ctr[1] = lo1;
@@ -323,6 +339,7 @@ template<bool high>
 QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
 {
    // convert 32 to 64 bit
+#ifdef __LITTLE_ENDIAN__
    if (high)
    {
        x = vec_mergel(x, vec_splats(0U));
@@ -333,13 +350,31 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32
        x = vec_mergeh(x, vec_splats(0U));
        y = vec_mergeh(y, vec_splats(0U));
    }
+#else
+    if (high)
+    {
+        x = vec_mergel(vec_splats(0U), x);
+        y = vec_mergel(vec_splats(0U), y);
+    }
+    else
+    {
+        x = vec_mergeh(vec_splats(0U), x);
+        y = vec_mergeh(vec_splats(0U), y);
+    }
+#endif
    // calculate z = x ^ y << (53 - 32))
    __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
    z = vec_xor((__vector uint64) x, z);
    // convert uint64 to double
-    __vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available
+#if defined(__has_builtin) && __has_builtin(__builtin_convertvector)
+    __vector double rs = __builtin_convertvector(z, __vector double);
+#elif defined(__GNUC__) && __GNUC__ >= 8
+    __vector double rs = vec_ctf(z, 0);
+#else
+    __vector double rs = vec_ctd(z, 0);
+#endif
    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
    rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));