Skip to content
Snippets Groups Projects
Commit 0b99c231 authored by Michael Kuron
Browse files

include pveclib header if needed and use LRU store for stream

parent 466a3426
No related branches found
No related tags found
1 merge request: !228 Vectorization improvements
Pipeline #31193 passed
...@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): ...@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
'loadA': 'ld[0x0, 0]', 'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]', 'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]', 'storeA': 'st[1, 0x0, 0]',
'stream': 'st[1, 0x0, 0]', 'stream': 'stl[1, 0x0, 0]',
'abs': 'abs[0]', 'abs': 'abs[0]',
'==': 'cmpeq[0, 1]', '==': 'cmpeq[0, 1]',
......
...@@ -16,12 +16,15 @@ ...@@ -16,12 +16,15 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__ibmxl__) #if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
#include <ppu_intrinsics.h> #include <ppu_intrinsics.h>
#endif #endif
#ifdef __ALTIVEC__ #ifdef __ALTIVEC__
#include <altivec.h> #include <altivec.h>
#undef bool #undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif #endif
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
...@@ -46,7 +49,7 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) ...@@ -46,7 +49,7 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{ {
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
// host code // host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__ibmxl__)) #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b); *hip = __mulhwu(a,b);
return a*b; return a*b;
#else #else
...@@ -298,7 +301,12 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct ...@@ -298,7 +301,12 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
#ifdef __ALTIVEC__ #ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key) QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
{ {
#ifdef __POWER10_VECTOR__ #ifndef _ARCH_PWR8
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
...@@ -364,11 +372,15 @@ QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector ...@@ -364,11 +372,15 @@ QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector
#endif #endif
// calculate z = x ^ y << (53 - 32)) // calculate z = x ^ y << (53 - 32))
#ifdef _ARCH_PWR8
__vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL)); __vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#else
__vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#endif
z = vec_xor((__vector unsigned long long) x, z); z = vec_xor((__vector unsigned long long) x, z);
// convert uint64 to double // convert uint64 to double
#ifdef __ibmxl__ #ifdef __xlC__
__vector double rs = vec_ctd(z, 0); __vector double rs = vec_ctd(z, 0);
#else #else
__vector double rs = vec_ctf(z, 0); __vector double rs = vec_ctf(z, 0);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment