Generate OpenMP code (still not working)

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>

Generate OpenMP code (still not working)
3a8e49af · Rafael Ravedutti · e7f04d1a
Commit 3a8e49af authored 1 year ago by Rafael Ravedutti
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ PYCMD=python3
 CC=mpicc
 #CC=mpiicpx
 #CC=mpiicpc
-CFLAGS=-Ofast -march=core-avx2 ${MPI_FLAGS} ${LIKWID_FLAGS}
+CFLAGS=-Ofast -march=core-avx2 -fopenmp ${MPI_FLAGS} ${LIKWID_FLAGS}
 #CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
 #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
 DEBUG_FLAGS=

--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -25,6 +25,29 @@ __host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset,
 __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count);
 __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count);

+#ifdef PAIRS_TARGET_OPENMP
+#include <omp.h>
+
+inline __host__ int host_atomic_add(int *addr, int val) {
+    int result;
+    #pragma omp critical
+    {
+        *addr += val;
+        result = *addr;
+    }
+    return result - val;
+}
+
+inline __host__ real_t host_atomic_add(real_t *addr, real_t val) {
+    real_t result;
+    #pragma omp critical
+    {
+        *addr += val;
+        result = *addr;
+    }
+    return result - val;
+}
+#else
 inline __host__ int host_atomic_add(int *addr, int val) {
    *addr += val;
    return *addr - val;
@@ -35,6 +58,7 @@ inline __host__ real_t host_atomic_add(real_t *addr, real_t val) {
    *addr += val;
    return tmp;
 }
+#endif

 inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
    const int add_res = *addr + val;

--- a/src/pairs/__init__.py
+++ b/src/pairs/__init__.py
@@ -21,7 +21,10 @@ def simulation(
        CGen(ref, debug), shapes, dims, timesteps, double_prec, use_contact_history,
        particle_capacity, neighbor_capacity)

-def target_cpu():
+def target_cpu(parallel=False):
+    if parallel:
+        return Target(Target.Backend_CPP, [Target.Feature_CPU, Target.Feature_OpenMP])
+
    return Target(Target.Backend_CPP, Target.Feature_CPU)

 def target_gpu():

--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -62,6 +62,10 @@ class CGen:
        if self.target.is_gpu():
            self.print("#define PAIRS_TARGET_CUDA")

+        if self.target.is_openmp():
+            self.print("#define PAIRS_TARGET_OPENMP")
+            self.print("#include <omp.h>")
+
        self.print("#include <limits.h>")
        self.print("#include <math.h>")
        self.print("#include <stdbool.h>")
@@ -507,6 +511,10 @@ class CGen:
            iterator = self.generate_expression(ast_node.iterator)
            lower_range = self.generate_expression(ast_node.min)
            upper_range = self.generate_expression(ast_node.max)
+
+            if self.target.is_openmp() and ast_node.is_kernel_candidate():
+                self.print("#pragma omp parallel for")
+
            self.print(f"for(int {iterator} = {lower_range}; {iterator} < {upper_range}; {iterator}++) {{")
            self.generate_statement(ast_node.block)
            self.print("}")

--- a/src/pairs/code_gen/target.py
+++ b/src/pairs/code_gen/target.py
@@ -14,6 +14,7 @@ class Target:
    Feature_AVX2 = 3
    Feature_AVX512 = 4
    Feature_GPU = 5
+    Feature_OpenMP = 6

    # Operating system
    OS_Unknown = 0
@@ -37,3 +38,6 @@ class Target:

    def is_gpu(self):
        return self.has_feature(Target.Feature_GPU)
+
+    def is_openmp(self):
+        return self.has_feature(Target.Feature_OpenMP)