From 149e9a25a6c88d02cff10bd3ceb1234d64d45a97 Mon Sep 17 00:00:00 2001
From: Behzad Safaei <iwia103h@alex1.nhr.fau.de>
Date: Tue, 24 Sep 2024 13:09:48 +0200
Subject: [PATCH] last_generated.hpp gets generated and included,
 reduceBoundaryWeights bugs fixed, CUDA_ARCH has issues

---
 CMakeLists.txt                          | 31 +++++++-----
 runtime/boundary_weights.cpp            |  2 +-
 runtime/devices/cuda.cu                 | 67 ++++++++++++++-----------
 runtime/devices/device.hpp              | 33 ++++++------
 runtime/domain/ParticleDataHandling.hpp |  2 +
 runtime/interfaces/last_generated.hpp   | 27 ----------
 runtime/timers.hpp                      |  2 +
 src/pairs/code_gen/cgen.py              |  6 ++-
 8 files changed, 83 insertions(+), 87 deletions(-)
 delete mode 100644 runtime/interfaces/last_generated.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17a08e0..32f6d0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,9 +9,11 @@ endif()
 
 set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
 
-if(NOT CUDA_ARCH)
-    set(CUDA_ARCH sm_80)
-endif()
+set(CMAKE_CUDA_ARCHITECTURES 80)
+set(CUDA_ARCH 80)
+# if(NOT CUDA_ARCH)
+#     set(CUDA_ARCH 80)
+# endif()
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
@@ -38,6 +40,8 @@ set(CPU_SRC "${TESTCASE}.cpp")
 set(GPU_SRC "${TESTCASE}.cu")
 set(CPU_BIN "${TESTCASE}_cpu")
 set(GPU_BIN "${TESTCASE}_gpu")
+set(GEN_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
+set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
 
 set(RUNTIME_COMMON_FILES
     runtime/pairs.cpp
@@ -98,12 +102,13 @@ add_library(runtime_cpu STATIC runtime/devices/dummy.cpp)
 target_link_libraries(${CPU_BIN} runtime_cpu)
 
 add_custom_command(
-    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC}
+    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}
     COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py cpu
     COMMENT "Generate CPU code"
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC})
+    
+target_include_directories(${CPU_BIN} PRIVATE ${GEN_HEADER_DIR})
+add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
 add_dependencies(${CPU_BIN} gen_cpu)
 
 if(COMPILE_CUDA)
@@ -128,21 +133,23 @@ if(COMPILE_CUDA)
     set_target_properties(runtime_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
     target_link_libraries(${GPU_BIN} runtime_gpu)
-    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(${GPU_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(${GPU_BIN} PRIVATE PAIRS_TARGET_CUDA)
 
-    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(runtime_gpu PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(runtime_gpu PRIVATE PAIRS_TARGET_CUDA)
 
     add_custom_command(
-        OUTPUT ${GPU_SRC}
+        OUTPUT ${GPU_SRC} ${GEN_HEADER}
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py gpu
         COMMENT "Generate GPU code"
         DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
 
-    add_custom_target(gen_gpu DEPENDS ${GPU_SRC})
+    add_custom_target(gen_gpu DEPENDS ${GPU_SRC} ${GEN_HEADER})
     add_dependencies(${GPU_BIN} gen_gpu)
 endif()
 
diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
index 1a714d1..35880c0 100644
--- a/runtime/boundary_weights.cpp
+++ b/runtime/boundary_weights.cpp
@@ -8,7 +8,7 @@
 #include "pairs_common.hpp"
 
 // Always include last generated interfaces
-#include "interfaces/last_generated.hpp"
+#include "last_generated.hpp"
 
 #ifdef PAIRS_TARGET_CUDA
 int cuda_compute_boundary_weights(
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 0883728..e9de4c9 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -1,6 +1,8 @@
 #include <cuda_runtime.h>
 #include <iostream>
 #include <cstring>
+#include "last_generated.hpp"
+#include "../pairs_common.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
 #define REDUCE_BLOCK_SIZE   64
@@ -72,44 +74,23 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
-int cuda_compute_boundary_weights(
-    real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
-
-    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
-    int *h_weights = (int *) malloc(nblocks * sizeof(int));
-    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
-    int red = 0;
-
-    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
 
-    reduceBoundaryWeights(
-        position, start, particle_capacity, 
-    )
-    CUDA_ASSERT(cudaPeekAtLastError());
-    CUDA_ASSERT(cudaDeviceSynchronize());
-    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
-
-    reduceBoundaryWeights<nblocks, REDUCE_BLOCK_SIZE>();
-
-    for(int i = 0; i < nblocks; i++) {
-        red += h_weights[i];
-    }
-
-    return red;
-}
+void __global__ reduceBoundaryWeights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
 
-void __device__ reduceBoundaryWeights() {
     __shared__ int red_data[REDUCE_BLOCK_SIZE];
     int tid = threadIdx.x;
     int i = blockIdx.x * blockDim.x + tid;
-    real_t pos_x = pairs_cuda_interface::get_position(position, start + i, 0, particle_capacity);
-    real_t pos_y = pairs_cuda_interface::get_position(position, start + i, 1, particle_capacity);
-    real_t pos_z = pairs_cuda_interface::get_position(position, start + i, 2, particle_capacity);
+    int particle_idx = start + i;
 
     red_data[tid] = 0;
 
-    if(i < n) {
+    if(particle_idx < end) {
+        real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
+        real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
+        real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
+
         if( pos_x > xmin && pos_x <= xmax &&
             pos_y > ymin && pos_y <= ymax &&
             pos_z > zmin && pos_z <= zmax) {
@@ -134,4 +115,30 @@ void __device__ reduceBoundaryWeights() {
     }
 }
 
+int cuda_compute_boundary_weights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+
+    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
+            position, start, end, particle_capacity, 
+            xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
+
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+
+    return red;
+}
+
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index e340f12..e2740b0 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -8,6 +8,8 @@
 #ifndef PAIRS_TARGET_CUDA
 #   define __host__
 typedef int cudaError_t;
+#else
+#include <cuda_runtime.h>
 #endif
 
 namespace pairs {
@@ -71,24 +73,25 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
-#if __CUDA_ARCH__ < 600
-__device__ double atomicAdd_double(double* address, double val) {
-    unsigned long long int * ull_addr = (unsigned long long int*) address;
-    unsigned long long int old = *ull_addr, assumed;
-
-    do {
-        assumed = old;
-        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-#else
+// #if __CUDA_ARCH__ < 600
+// #error "CUDA architecture is less than 600"
+// __device__ double atomicAdd_double(double* address, double val) {
+//     unsigned long long int * ull_addr = (unsigned long long int*) address;
+//     unsigned long long int old = *ull_addr, assumed;
+
+//     do {
+//         assumed = old;
+//         old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+//     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//     } while (assumed != old);
+
+//     return __longlong_as_double(old);
+// }
+// #else
 __device__ double atomicAdd_double(double* address, double val) {
     return atomicAdd(address, val);
 }
-#endif
+// #endif
 
 __device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
 __device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
index f43c494..b99e8cb 100644
--- a/runtime/domain/ParticleDataHandling.hpp
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -1,6 +1,8 @@
 #include <blockforest/BlockForest.h>
 #include <blockforest/BlockDataHandling.h>
 
+#pragma once
+
 namespace pairs {
 
 class PairsRuntime;
diff --git a/runtime/interfaces/last_generated.hpp b/runtime/interfaces/last_generated.hpp
deleted file mode 100644
index c8bb6b2..0000000
--- a/runtime/interfaces/last_generated.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "../pairs.hpp"
-
-namespace pairs_host_interface {
-
-int get_uid(int *uid, int i) { return uid[i]; }
-int get_shape(int *shape, int i) { return shape[i]; }
-int get_flags(int *flags, int i) { return flags[i]; }
-double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-double get_mass(double *mass, int i) { return mass[i]; }
-double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-int get_type(int *type, int i) { return type[i]; }
-
-}
-
-namespace pairs_cuda_interface {
-
-__inline__ __device__ int get_uid(int *uid, int i) { return uid[i]; }
-__inline__ __device__ int get_shape(int *shape, int i) { return shape[i]; }
-__inline__ __device__ int get_flags(int *flags, int i) { return flags[i]; }
-__inline__ __device__ double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-__inline__ __device__ double get_mass(double *mass, int i) { return mass[i]; }
-__inline__ __device__ double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-__inline__ __device__ double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-__inline__ __device__ int get_type(int *type, int i) { return type[i]; }
-
-}
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index 0205892..c4cdc94 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <unordered_map>
 
+#pragma once
+
 using namespace std;
 
 namespace pairs {
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 1a3103a..d4a9d75 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -84,7 +84,7 @@ class CGen:
         #self.print = Printer(f"runtime/interfaces/{self.ref}.hpp")
         self.print = Printer("runtime/interfaces/last_generated.hpp")
         self.print.start()
-        self.print("#include \"../pairs.hpp\"")
+        self.print("#pragma once")
         self.generate_interface_namespace('pairs_host_interface')
         self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
         self.print.end()
@@ -226,13 +226,14 @@ class CGen:
             self.print(f"{tkw} {vname};")
 
             if self.target.is_gpu() and v.device_flag:
-                self.print(f"RuntimeVar<{tkw}> rv_{vname()};")
+                self.print(f"RuntimeVar<{tkw}> rv_{vname};")
 
         self.print.add_indent(-4)
         self.print("};")
         self.print("")
 
     def generate_program(self, ast_node):
+        self.generate_interfaces()
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)
         self.print.start()
@@ -248,6 +249,7 @@ class CGen:
         self.print.end()
 
     def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module):
+        self.generate_interfaces()
         # Generate CUDA/CPP file with modules
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)
-- 
GitLab