diff --git a/Makefile b/Makefile index 1a0887c774707bcf82dc013a562d98daf26905f1..c823f1e8862661ebb103feaf7dae3edbf371c74e 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ PYCMD=python3 CC=mpicc #CC=mpiicpx #CC=mpiicpc -CFLAGS=-Ofast -march=core-avx2 ${MPI_FLAGS} ${LIKWID_FLAGS} +CFLAGS=-Ofast -march=core-avx2 -fopenmp ${MPI_FLAGS} ${LIKWID_FLAGS} #CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS} #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS} DEBUG_FLAGS= diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp index ade04053e90b28feac36f4738725aa5464e5fd8f..107b70ee91512ed9ccd336be2168e0b75ed5eab8 100644 --- a/runtime/devices/device.hpp +++ b/runtime/devices/device.hpp @@ -25,6 +25,29 @@ __host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count); __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count); +#ifdef PAIRS_TARGET_OPENMP +#include <omp.h> + +inline __host__ int host_atomic_add(int *addr, int val) { + int result; + #pragma omp critical + { + *addr += val; + result = *addr; + } + return result - val; +} + +inline __host__ real_t host_atomic_add(real_t *addr, real_t val) { + real_t result; + #pragma omp critical + { + *addr += val; + result = *addr; + } + return result - val; +} +#else inline __host__ int host_atomic_add(int *addr, int val) { *addr += val; return *addr - val; @@ -35,6 +58,7 @@ inline __host__ real_t host_atomic_add(real_t *addr, real_t val) { *addr += val; return tmp; } +#endif inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize, int capacity) { const int add_res = *addr + val; diff --git a/src/pairs/__init__.py b/src/pairs/__init__.py index ab81066421f8c7f359c2a35694009b229395ef6d..e89e0a79cbb8f3e39143dc3461a9734a6dcf8d7b 100644 --- a/src/pairs/__init__.py +++ b/src/pairs/__init__.py @@ -21,7 +21,10 @@ def simulation( CGen(ref, debug), shapes, dims, timesteps, double_prec, use_contact_history, particle_capacity, neighbor_capacity) -def target_cpu(): +def target_cpu(parallel=False): + if parallel: + return Target(Target.Backend_CPP, [Target.Feature_CPU, Target.Feature_OpenMP]) + return Target(Target.Backend_CPP, Target.Feature_CPU) def target_gpu(): diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py index adf3fb95dccead46f9a3ec0fd9df4f65a08e43ea..0d6d8b05c0bf75d410e835e22144e89514bd83a9 100644 --- a/src/pairs/code_gen/cgen.py +++ b/src/pairs/code_gen/cgen.py @@ -62,6 +62,10 @@ class CGen: if self.target.is_gpu(): self.print("#define PAIRS_TARGET_CUDA") + if self.target.is_openmp(): + self.print("#define PAIRS_TARGET_OPENMP") + self.print("#include <omp.h>") + self.print("#include <limits.h>") self.print("#include <math.h>") self.print("#include <stdbool.h>") @@ -507,6 +511,10 @@ class CGen: iterator = self.generate_expression(ast_node.iterator) lower_range = self.generate_expression(ast_node.min) upper_range = self.generate_expression(ast_node.max) + + if self.target.is_openmp() and ast_node.is_kernel_candidate(): + self.print("#pragma omp parallel for") + self.print(f"for(int {iterator} = {lower_range}; {iterator} < {upper_range}; {iterator}++) {{") self.generate_statement(ast_node.block) self.print("}") diff --git a/src/pairs/code_gen/target.py b/src/pairs/code_gen/target.py index 30acef52b2dd7399a74742e5b943063c94183716..ef569ea1bb3871cc3d6fde3f851a91e44e4e1e44 100644 --- a/src/pairs/code_gen/target.py +++ b/src/pairs/code_gen/target.py @@ -14,6 +14,7 @@ class Target: Feature_AVX2 = 3 Feature_AVX512 = 4 Feature_GPU = 5 + Feature_OpenMP = 6 # Operating system OS_Unknown = 0 @@ -37,3 +38,6 @@ class Target: def is_gpu(self): return self.has_feature(Target.Feature_GPU) + + def is_openmp(self): + return self.has_feature(Target.Feature_OpenMP)