diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17a08e09013f43a0b304f0c973bfc4651a380ebb..32f6d0b872bec9f1824d81eda3021796ebc0d457 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,9 +9,11 @@ endif()
 
 set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
-if(NOT CUDA_ARCH)
-    set(CUDA_ARCH sm_80)
-endif()
+set(CMAKE_CUDA_ARCHITECTURES 80)
+set(CUDA_ARCH 80)
+# if(NOT CUDA_ARCH)
+#     set(CUDA_ARCH 80)
+# endif()
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
@@ -38,6 +40,8 @@ set(CPU_SRC "${TESTCASE}.cpp")
 set(GPU_SRC "${TESTCASE}.cu")
 set(CPU_BIN "${TESTCASE}_cpu")
 set(GPU_BIN "${TESTCASE}_gpu")
+set(GEN_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
+set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
 
 set(RUNTIME_COMMON_FILES
     runtime/pairs.cpp
@@ -98,12 +102,13 @@ add_library(runtime_cpu STATIC
     runtime/devices/dummy.cpp)
 target_link_libraries(${CPU_BIN} runtime_cpu)
 add_custom_command(
-    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC}
+    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}
     COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py cpu
     COMMENT "Generate CPU code"
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC})
+
+target_include_directories(${CPU_BIN} PRIVATE ${GEN_HEADER_DIR})
+add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
 add_dependencies(${CPU_BIN} gen_cpu)
 
 if(COMPILE_CUDA)
@@ -128,21 +133,23 @@ if(COMPILE_CUDA)
     set_target_properties(runtime_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
     target_link_libraries(${GPU_BIN} runtime_gpu)
-    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(${GPU_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(${GPU_BIN} PRIVATE PAIRS_TARGET_CUDA)
 
-    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(runtime_gpu PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(runtime_gpu PRIVATE PAIRS_TARGET_CUDA)
 
     add_custom_command(
-        OUTPUT ${GPU_SRC}
+        OUTPUT ${GPU_SRC} ${GEN_HEADER}
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py gpu
         COMMENT "Generate GPU code"
         DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
 
-    add_custom_target(gen_gpu DEPENDS ${GPU_SRC})
+    add_custom_target(gen_gpu DEPENDS ${GPU_SRC} ${GEN_HEADER})
     add_dependencies(${GPU_BIN} gen_gpu)
 
 endif()
diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
index 1a714d17794f9e5d89612b4034ee605106e466a3..35880c0be9edfe1407fffcfc02865ac281cce881 100644
--- a/runtime/boundary_weights.cpp
+++ b/runtime/boundary_weights.cpp
@@ -8,7 +8,7 @@
 #include "pairs_common.hpp"
 
 // Always include last generated interfaces
-#include "interfaces/last_generated.hpp"
+#include "last_generated.hpp"
 
 #ifdef PAIRS_TARGET_CUDA
 int cuda_compute_boundary_weights(
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 0883728550bf5c764e760fa2a4b05e026e76dfab..e9de4c9ff93ece8a84a2234200b4d330725279da 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -1,6 +1,8 @@
 #include <cuda_runtime.h>
 #include <iostream>
 #include <cstring>
+#include "last_generated.hpp"
+#include "../pairs_common.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
 #define REDUCE_BLOCK_SIZE 64
@@ -72,44 +74,23 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
-int cuda_compute_boundary_weights(
-    real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
-
-    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
-    int *h_weights = (int *) malloc(nblocks * sizeof(int));
-    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
-    int red = 0;
-
-    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
-    reduceBoundaryWeights(
-        position, start, particle_capacity,
-    )
-    CUDA_ASSERT(cudaPeekAtLastError());
-    CUDA_ASSERT(cudaDeviceSynchronize());
-    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
-
-    reduceBoundaryWeights<nblocks, REDUCE_BLOCK_SIZE>();
-
-    for(int i = 0; i < nblocks; i++) {
-        red += h_weights[i];
-    }
-
-    return red;
-}
+void __global__ reduceBoundaryWeights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
 
-void __device__ reduceBoundaryWeights() {
     __shared__ int red_data[REDUCE_BLOCK_SIZE];
 
     int tid = threadIdx.x;
     int i = blockIdx.x * blockDim.x + tid;
-    real_t pos_x = pairs_cuda_interface::get_position(position, start + i, 0, particle_capacity);
-    real_t pos_y = pairs_cuda_interface::get_position(position, start + i, 1, particle_capacity);
-    real_t pos_z = pairs_cuda_interface::get_position(position, start + i, 2, particle_capacity);
+    int particle_idx = start + i;
 
     red_data[tid] = 0;
 
-    if(i < n) {
+    if(particle_idx < end) {
+        real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
+        real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
+        real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
+
         if( pos_x > xmin && pos_x <= xmax &&
             pos_y > ymin && pos_y <= ymax &&
             pos_z > zmin && pos_z <= zmax) {
@@ -134,4 +115,30 @@ void __device__ reduceBoundaryWeights() {
     }
 }
 
+int cuda_compute_boundary_weights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+
+    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
+        position, start, end, particle_capacity,
+        xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
+
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+
+    return red;
+}
+
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index e340f12ba4276ac1b7d4e5d67fda245acb115d18..e2740b0c47631ac6f4f78fa9d144bbaa79fabd4f 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -8,6 +8,8 @@
 #ifndef PAIRS_TARGET_CUDA
 #   define __host__
 typedef int cudaError_t;
+#else
+#include <cuda_runtime.h>
 #endif
 
 namespace pairs {
@@ -71,24 +73,25 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
 
-#if __CUDA_ARCH__ < 600
-__device__ double atomicAdd_double(double* address, double val) {
-    unsigned long long int * ull_addr = (unsigned long long int*) address;
-    unsigned long long int old = *ull_addr, assumed;
-
-    do {
-        assumed = old;
-        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-#else
+// #if __CUDA_ARCH__ < 600
+// #error "CUDA architecture is less than 600"
+// __device__ double atomicAdd_double(double* address, double val) {
+//     unsigned long long int * ull_addr = (unsigned long long int*) address;
+//     unsigned long long int old = *ull_addr, assumed;
+
+//     do {
+//         assumed = old;
+//         old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+//         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//     } while (assumed != old);
+
+//     return __longlong_as_double(old);
+// }
+// #else
 __device__ double atomicAdd_double(double* address, double val) { return atomicAdd(address, val); }
-#endif
+// #endif
 
 __device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
 __device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
index f43c494593a06aebeac543f03fd195dd7817f0e7..b99e8cbde5fff5ad78898b277f4462b52062e9f7 100644
--- a/runtime/domain/ParticleDataHandling.hpp
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -1,6 +1,8 @@
 #include <blockforest/BlockForest.h>
 #include <blockforest/BlockDataHandling.h>
 
+#pragma once
+
 namespace pairs {
 
 class PairsRuntime;
diff --git a/runtime/interfaces/last_generated.hpp b/runtime/interfaces/last_generated.hpp
deleted file mode 100644
index c8bb6b29da34073cb58e82a8b54dc5d0f832f3ce..0000000000000000000000000000000000000000
--- a/runtime/interfaces/last_generated.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "../pairs.hpp"
-
-namespace pairs_host_interface {
-
-int get_uid(int *uid, int i) { return uid[i]; }
-int get_shape(int *shape, int i) { return shape[i]; }
-int get_flags(int *flags, int i) { return flags[i]; }
-double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-double get_mass(double *mass, int i) { return mass[i]; }
-double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-int get_type(int *type, int i) { return type[i]; }
-
-}
-
-namespace pairs_cuda_interface {
-
-__inline__ __device__ int get_uid(int *uid, int i) { return uid[i]; }
-__inline__ __device__ int get_shape(int *shape, int i) { return shape[i]; }
-__inline__ __device__ int get_flags(int *flags, int i) { return flags[i]; }
-__inline__ __device__ double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-__inline__ __device__ double get_mass(double *mass, int i) { return mass[i]; }
-__inline__ __device__ double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-__inline__ __device__ double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-__inline__ __device__ int get_type(int *type, int i) { return type[i]; }
-
-}
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index 02058924ada457213214511a4099a05cf87695ac..c4cdc943aa5faeed57b4971684277e0844d3e7da 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <unordered_map>
 
+#pragma once
+
 using namespace std;
 
 namespace pairs {
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 1a3103a40f8cad7936003fe0fb35781edd328ecf..d4a9d757850a74e01a9d488d12b1284c3bfa7754 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -84,7 +84,7 @@ class CGen:
         #self.print = Printer(f"runtime/interfaces/{self.ref}.hpp")
         self.print = Printer("runtime/interfaces/last_generated.hpp")
         self.print.start()
-        self.print("#include \"../pairs.hpp\"")
+        self.print("#pragma once")
         self.generate_interface_namespace('pairs_host_interface')
         self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
         self.print.end()
@@ -226,13 +226,14 @@ class CGen:
             self.print(f"{tkw} {vname};")
 
             if self.target.is_gpu() and v.device_flag:
-                self.print(f"RuntimeVar<{tkw}> rv_{vname()};")
+                self.print(f"RuntimeVar<{tkw}> rv_{vname};")
 
         self.print.add_indent(-4)
         self.print("};")
         self.print("")
 
     def generate_program(self, ast_node):
+        self.generate_interfaces()
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)
         self.print.start()
@@ -248,6 +249,7 @@ class CGen:
         self.print.end()
 
     def generate_library(self, initialize_module, create_domain_module, setup_sim_module, do_timestep_module):
+        self.generate_interfaces()
         # Generate CUDA/CPP file with modules
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)