From 1f800cbabbaf361098d7b46cc859cf4e70f0d644 Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Fri, 26 Feb 2021 09:12:11 +0000
Subject: [PATCH] ARM has a cache line zero instruction that prevents data that
 will be overwritten anyway from being loaded from RAM. Kind of a non-temporal
 store light.

---
 pystencils/include/arm_neon_helpers.h | 29 +++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h
index ba6cbc2d7..0db982860 100644
--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
@@ -17,3 +17,32 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
     alignas(16) int data[4] = {a, b, c, d};
     return vld1q_s32(data);
 }
+
+// ZVA size is usually 64 bytes, but needs to be checked
+int zva_size() {
+    uint64_t dczid;
+    asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+    if ((dczid & (1 << 4)) == 0) {
+        int size = 4 << (dczid & 0xf);
+        return size;
+    }
+    return 0;
+}
+
+// write zva_size bytes of float vectors to memory aligned to zva_size bytes
+inline void stream_f32(float32x4_t f[4], float* p) {
+    asm volatile("dc zva, %0"::"r"(p));
+    vst1q_f32(p, f[0]);
+    vst1q_f32(p+4, f[1]);
+    vst1q_f32(p+8, f[2]);
+    vst1q_f32(p+12, f[3]);
+}
+
+// write zva_size bytes of double vectors to memory aligned to zva_size bytes
+inline void stream_f64(float64x2_t f[4], double* p) {
+    asm volatile("dc zva, %0"::"r"(p));
+    vst1q_f64(p, f[0]);
+    vst1q_f64(p+2, f[1]);
+    vst1q_f64(p+4, f[2]);
+    vst1q_f64(p+6, f[3]);
+}
-- 
GitLab