diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index ba6cbc2d7bae45591bcec580b98394c4f6830339..0db982860b1ef53d5520a4a696f75371bbc651b3 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -17,3 +17,32 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) alignas(16) int data[4] = {a, b, c, d}; return vld1q_s32(data); } + +// ZVA size is usually 64 bytes, but needs to be checked +int zva_size() { + uint64_t dczid; + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid)); + if ((dczid & (1 << 4)) == 0) { + int size = 4 << (dczid & 0xf); + return size; + } + return 0; +} + +// write zva_size bytes of float vectors to memory aligned to zva_size bytes +inline void stream_f32(float32x4_t f[4], float* p) { + asm volatile("dc zva, %0"::"r"(p)); + vst1q_f32(p, f[0]); + vst1q_f32(p+4, f[1]); + vst1q_f32(p+8, f[2]); + vst1q_f32(p+12, f[3]); +} + +// write zva_size bytes of double vectors to memory aligned to zva_size bytes +inline void stream_f64(float64x2_t f[4], double* p) { + asm volatile("dc zva, %0"::"r"(p)); + vst1q_f64(p, f[0]); + vst1q_f64(p+2, f[1]); + vst1q_f64(p+4, f[2]); + vst1q_f64(p+6, f[3]); +}