From f56660e95328bd7b957ee824a0f80c7a2be1d5c9 Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Fri, 21 May 2021 20:52:21 +0200 Subject: [PATCH] reorder mulhs so they go with the corresponding muls Supposedly some processors and compilers will fuse mul+mulh into one instruction --- pystencils/include/philox_rand.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index e06eb4fe6..0571eb39d 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -351,13 +351,13 @@ QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned i { #ifndef _ARCH_PWR8 __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1)); #elif defined(_ARCH_PWR10) __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); - __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); #else __vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); @@ -708,8 +708,8 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) { svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); - svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1)); svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); + svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1)); svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1)); ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0))); @@ -860,8 +860,8 @@ QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32 vuint32m1_t key0, vuint32m1_t key1) { vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1()); - vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1()); vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1()); + vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1()); vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1()); ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1()); -- GitLab