Skip to content
Snippets Groups Projects
Commit f56660e9 authored by Michael Kuron
Browse files

Reorder the mulh operations so that each one directly follows its corresponding mul

Supposedly, some processors and compilers will fuse an adjacent mul+mulh pair on the same operands into a single instruction.
parent 144c6e14
No related branches found
No related tags found
1 merge request: !247 "OpenCL RNG"
Pipeline #32240 passed
...@@ -351,13 +351,13 @@ QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned i ...@@ -351,13 +351,13 @@ QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned i
{ {
#ifndef _ARCH_PWR8 #ifndef _ARCH_PWR8
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10) #elif defined(_ARCH_PWR10)
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else #else
__vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
...@@ -708,8 +708,8 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ...@@ -708,8 +708,8 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
{ {
svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1)); svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0))); ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0)));
...@@ -860,8 +860,8 @@ QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32 ...@@ -860,8 +860,8 @@ QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32
vuint32m1_t key0, vuint32m1_t key1) vuint32m1_t key0, vuint32m1_t key1)
{ {
vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1()); vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1()); vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1()); vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1()); ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1());
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment