From d20cc1b8fbbff47293df94761849ffc4aa075d0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B6nig?= <jan.hoenig@fau.de>
Date: Fri, 12 Nov 2021 17:45:52 +0100
Subject: [PATCH] Working vectorizer

---
 examle/test.py                                | 32 +++++++++++++------
 pystencils_benchmark/templates/Clang.mk       |  4 +--
 pystencils_benchmark/templates/GCC.mk         |  2 +-
 pystencils_benchmark/templates/GCCdebug.mk    |  3 +-
 pystencils_benchmark/templates/Makefile       |  2 +-
 .../templates/aligned_malloc.h                |  2 +-
 pystencils_benchmark/templates/main.c         |  3 +-
 7 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/examle/test.py b/examle/test.py
index d516678..ee5e32b 100755
--- a/examle/test.py
+++ b/examle/test.py
@@ -4,40 +4,52 @@ import subprocess
 import numpy as np
 import sympy as sp
 import pystencils as ps
-from pystencils_benchmark import generate_benchmark
+from pystencils_benchmark import generate_benchmark, Compiler
 from pathlib import Path
 
 
-def generate(path: Path):
+def generate(path: Path, compiler: Compiler):  # compiler is forwarded unchanged to generate_benchmark
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
     alpha = sp.symbols('alpha')
 
+    kernels = []  # filled in append order, then passed as a whole to generate_benchmark
     @ps.kernel_config(ps.CreateKernelConfig())
     def vadd():
         a[0] @= b[0] + c[0]
-    kernel_vadd = ps.create_kernel(**vadd)
+    kernels.append(ps.create_kernel(**vadd))  # vadd with a default CreateKernelConfig
+
+    @ps.kernel_config(ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': 'best'}))
+    def vadd_vector():  # same assignment as vadd; only the kernel config differs
+        a[0] @= b[0] + c[0]
+    kernels.append(ps.create_kernel(**vadd_vector))
 
     @ps.kernel_config(ps.CreateKernelConfig())
     def daxpy():
         b[0] @= alpha * a[0] + b[0]
-    kernel_daxpy = ps.create_kernel(**daxpy)
+    kernels.append(ps.create_kernel(**daxpy))  # daxpy with a default CreateKernelConfig
+
+    @ps.kernel_config(ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': 'best'}))
+    def daxpy_vector():  # same assignment as daxpy; only the kernel config differs
+        b[0] @= alpha * a[0] + b[0]
+    kernels.append(ps.create_kernel(**daxpy_vector))
 
-    generate_benchmark([kernel_vadd, kernel_daxpy], path)
+    generate_benchmark(kernels, path, compiler=compiler)  # all four kernels go into one benchmark
 
 
 def make(path: Path):
-    subprocess.run(['make'])
+    subprocess.run(['make'], cwd=path, check=True)  # build in `path` (previously ignored); check=True raises on a failed build
 
 
-def execute(path: Path):
-    subprocess.run(['./benchmark-GCC', '200'])
+def execute(path: Path, compiler: Compiler):
+    subprocess.run([f'./benchmark-{compiler.name}', '100'], cwd=path, check=True)  # run the binary from `path` (previously ignored), where make() built it
 
 
 def main():
+    compiler = Compiler.GCCdebug  # single choice shared by generate() and execute(), which runs ./benchmark-<compiler.name>
     path = Path.cwd()
-    generate(path)
+    generate(path, compiler)  # emit the benchmark sources for the chosen compiler
     make(path)
-    execute(path)
+    execute(path, compiler)  # same compiler as generate(), so the binary name matches
 
 
 if __name__ == '__main__':
diff --git a/pystencils_benchmark/templates/Clang.mk b/pystencils_benchmark/templates/Clang.mk
index 61eee7f..be60d38 100644
--- a/pystencils_benchmark/templates/Clang.mk
+++ b/pystencils_benchmark/templates/Clang.mk
@@ -6,14 +6,14 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra
 
-CFLAGS   = -O3 -Wno-format -Wall $(ANSI_CFLAGS) -fopenmp
+CFLAGS   = -O3 -Wno-format -Wall $(ANSI_CFLAGS) -fopenmp -march=native
 # More warning pls
 #CFLAGS   += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align  -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
 # Maybe too much warnings
 #CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
 # Specific C flags
 CFLAGS   := $(CFLAGS) -Wstrict-prototypes
-LFLAGS   = -fopenmp
+LFLAGS   = -fopenmp=libomp
 DEFINES  = -D_GNU_SOURCE -DNDEBUG
 INCLUDES =
 LIBS     =
diff --git a/pystencils_benchmark/templates/GCC.mk b/pystencils_benchmark/templates/GCC.mk
index c633858..46668e6 100644
--- a/pystencils_benchmark/templates/GCC.mk
+++ b/pystencils_benchmark/templates/GCC.mk
@@ -6,7 +6,7 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra
 
-CFLAGS   = -O3 -Wno-format -Wall $(ANSI_CFLAGS) -fopenmp
+CFLAGS   = -O3 -Wno-format -Wall $(ANSI_CFLAGS) -fopenmp -march=native
 # More warning pls
 #CFLAGS   += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align  -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
 # Maybe too much warnings
diff --git a/pystencils_benchmark/templates/GCCdebug.mk b/pystencils_benchmark/templates/GCCdebug.mk
index f453f0d..47b79c3 100644
--- a/pystencils_benchmark/templates/GCCdebug.mk
+++ b/pystencils_benchmark/templates/GCCdebug.mk
@@ -6,8 +6,7 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra
 
-CFLAGS   = -O0 -g -Wno-format  -Wall $(ANSI_CFLAGS)
-FCFLAGS  =
+CFLAGS   = -O0 -g -Wno-format  -Wall $(ANSI_CFLAGS) -march=native
 LFLAGS   =
 DEFINES  = -D_GNU_SOURCE
 INCLUDES =
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index b9b8cfc..98fcaaa 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -30,7 +30,7 @@ $(BUILD_DIR)/%.o:  %.c
 
 $(BUILD_DIR)/%.s:  %.c
 	@echo "===>  GENERATE ASM  $@"
-	$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
+	$(Q)$(CC) -S $(CFLAGS) $< -o $@
 
 tags:
 	@echo "===>  GENERATE  TAGS"
diff --git a/pystencils_benchmark/templates/aligned_malloc.h b/pystencils_benchmark/templates/aligned_malloc.h
index 5353157..52693f9 100644
--- a/pystencils_benchmark/templates/aligned_malloc.h
+++ b/pystencils_benchmark/templates/aligned_malloc.h
@@ -14,6 +14,6 @@ inline void* aligned_malloc(size_t size, size_t align) {
          if(posix_memalign(&result, align, size)) result = 0;
     #endif
     return result;
-};
+}
 
 #endif
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/main.c
index 7f8ea99..1eea7c1 100644
--- a/pystencils_benchmark/templates/main.c
+++ b/pystencils_benchmark/templates/main.c
@@ -6,6 +6,7 @@
 #include <assert.h>
 
 #include "timing.h"
+#include "aligned_malloc.h"
 
 {%- for include in includes %}
 {{ include }}
@@ -26,7 +27,7 @@ int main(int argc, char **argv)
         {%- for field_name, dataType, elements, size, offset, alignment in kernel.fields %}
         // Initialization {{field_name}}
         {%- if alignment > 0 %}
-        {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
+        {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc({{size}}, {{alignment}});//, {{offset}});
         {%- else %}
         {{dataType}} * {{field_name}} = ({{dataType}} *) malloc({{size}});
         {%- endif %}
-- 
GitLab