From 0b99c23140b3419757e958b09be277a9bcd443dd Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Thu, 25 Mar 2021 10:14:28 +0100
Subject: [PATCH] include pveclib header when _ARCH_PWR8 is not defined and use LRU store (stl) for stream

---
 pystencils/backends/ppc_instruction_sets.py |  2 +-
 pystencils/include/philox_rand.h            | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
index f4df9d3b2..bb9e3e851 100644
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
         'loadA': 'ld[0x0, 0]',
         'storeU': 'xst[1, 0x0, 0]',
         'storeA': 'st[1, 0x0, 0]',
-        'stream': 'st[1, 0x0, 0]',
+        'stream': 'stl[1, 0x0, 0]',
 
         'abs': 'abs[0]',
         '==': 'cmpeq[0, 1]',
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 1b8b9d9f6..4d81d43e4 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -16,12 +16,15 @@
 #include <arm_neon.h>
 #endif
 
-#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__ibmxl__)
+#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
 #include <ppu_intrinsics.h>
 #endif
 #ifdef __ALTIVEC__
 #include <altivec.h>
 #undef bool
+#ifndef _ARCH_PWR8
+#include <pveclib/vec_int64_ppc.h>
+#endif
 #endif
 
 #ifndef __CUDA_ARCH__
@@ -46,7 +49,7 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
 #ifndef __CUDA_ARCH__
     // host code
-#if defined(__powerpc__) && (!defined(__clang__) || defined(__ibmxl__))
+#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
     *hip = __mulhwu(a,b);
     return a*b;
 #else
@@ -298,7 +301,12 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
 #ifdef __ALTIVEC__
 QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
 {
-#ifdef __POWER10_VECTOR__
+#ifndef _ARCH_PWR8
+    __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
+#elif defined(_ARCH_PWR10)
     __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
     __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
     __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
@@ -364,11 +372,15 @@ QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector
 #endif
 
     // calculate z = x ^ y << (53 - 32))
+#ifdef _ARCH_PWR8
     __vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
+#else
+    __vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
+#endif
     z = vec_xor((__vector unsigned long long) x, z);
 
     // convert uint64 to double
-#ifdef __ibmxl__
+#ifdef __xlC__
     __vector double rs = vec_ctd(z, 0);
 #else
     __vector double rs = vec_ctf(z, 0);
-- 
GitLab