diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py
index 26f61e909ee115c5b9e081877a232fdb082d2b7e..5a3703c7c0efa07021b5047e5d7b9dd09acad797 100644
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
 
 
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True):
+    if instruction_set != 'neon':
+        raise NotImplementedError(instruction_set)
+
     base_names = {
         '+': 'add[0, 1]',
         '-': 'sub[0, 1]',
diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
index 377715456bcba8e09882bae86a7f51b9807a5839..e3421e589e555f639c01274fd7bd99991b671247 100644
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
 
 
 def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
+    if instruction_set != 'vsx':
+        raise NotImplementedError(instruction_set)
+
     base_names = {
         '+': 'add[0, 1]',
         '-': 'sub[0, 1]',
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 4e0f55fb55f860ff8b1224645989cc1249fa0148..b3418eb3083f804338d804b1cb0fa709852ddbea 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -21,7 +21,7 @@ def get_supported_instruction_sets():
     elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
         import subprocess
         from pystencils.cpu.cpujit import get_compiler_config
-        command = [get_compiler_config()['command'], '-dM', '-E', '-']
+        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-']
         macros = subprocess.check_output(command, input='', text=True)
         if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
             return ['vsx']
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index a38c3e165bd8a2ec244d0ac175977a4f9a475c4b..cda6fd0cff2537d3e3cf345eefbdfec250b8696d 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -44,14 +44,16 @@ typedef std::uint64_t uint64;
 
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
+#ifndef __CUDA_ARCH__
+    // host code
 #ifdef __powerpc__
     *hip = __mulhwu(a,b);
     return a*b;
-#elif !defined(__CUDA_ARCH__)
-    // host code
+#else
     uint64 product = ((uint64)a) * ((uint64)b);
     *hip = product >> 32;
     return (uint32)product;
+#endif
 #else
     // device code
     *hip = __umulhi(a,b);
@@ -296,15 +298,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
 #ifdef __ALTIVEC__
 QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
 {
+#ifdef __POWER10_VECTOR__
+    __vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
+#else
     __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
     __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
     __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
     __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
 
+#ifdef __LITTLE_ENDIAN__
     __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
     __vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
     __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
     __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
+#else
+    __vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b);
+    __vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b);
+    __vector uint32 hi0 = vec_mergee(lohi0a, lohi0b);
+    __vector uint32 hi1 = vec_mergee(lohi1a, lohi1b);
+#endif
+#endif
 
     ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
     ctr[1] = lo1;
@@ -323,6 +339,7 @@ template<bool high>
 QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
 {
     // convert 32 to 64 bit
+#ifdef __LITTLE_ENDIAN__
     if (high)
     {
         x = vec_mergel(x, vec_splats(0U));
@@ -333,13 +350,39 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32
         x = vec_mergeh(x, vec_splats(0U));
         y = vec_mergeh(y, vec_splats(0U));
     }
+#else
+    if (high)
+    {
+        x = vec_mergel(vec_splats(0U), x);
+        y = vec_mergel(vec_splats(0U), y);
+    }
+    else
+    {
+        x = vec_mergeh(vec_splats(0U), x);
+        y = vec_mergeh(vec_splats(0U), y);
+    }
+#endif
 
     // calculate z = x ^ y << (53 - 32))
     __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
     z = vec_xor((__vector uint64) x, z);
 
     // convert uint64 to double
-    __vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available
+    // __has_builtin must be probed in a nested #if: a preprocessor that lacks it
+    // (e.g. GCC before 10) rejects "defined(__has_builtin) && __has_builtin(...)".
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_convertvector)
+#define PHILOX_HAVE_BUILTIN_CONVERTVECTOR
+#endif
+#endif
+#if defined(PHILOX_HAVE_BUILTIN_CONVERTVECTOR)
+    __vector double rs = __builtin_convertvector(z, __vector double);
+#elif defined(__GNUC__) && __GNUC__ >= 8
+    __vector double rs = vec_ctf(z, 0);
+#else
+    __vector double rs = vec_ctd(z, 0);
+#endif
+#undef PHILOX_HAVE_BUILTIN_CONVERTVECTOR
     // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
     rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));