From 342f797c54b9b8cbf78f1db00284c14a240e4a05 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti <rafaelravedutti@gmail.com> Date: Tue, 24 Sep 2024 15:47:18 +0200 Subject: [PATCH] Move CUDA routines back to boundary_weights.cpp Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com> --- runtime/boundary_weights.cpp | 67 ++++++++++++++++++++++++++++++++++- runtime/devices/cuda.cu | 68 ------------------------------------ 2 files changed, 66 insertions(+), 69 deletions(-) diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp index 35880c0..065c4cb 100644 --- a/runtime/boundary_weights.cpp +++ b/runtime/boundary_weights.cpp @@ -11,9 +11,74 @@ #include "last_generated.hpp" #ifdef PAIRS_TARGET_CUDA + +#define REDUCE_BLOCK_SIZE 64 + +void __global__ reduceBoundaryWeights( + real_t *position, int start, int end, int particle_capacity, + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) { + + __shared__ int red_data[REDUCE_BLOCK_SIZE]; + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + int particle_idx = start + i; + + red_data[tid] = 0; + + if(particle_idx < end) { + real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity); + real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity); + real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity); + + if( pos_x > xmin && pos_x <= xmax && + pos_y > ymin && pos_y <= ymax && + pos_z > zmin && pos_z <= zmax) { + red_data[tid] = 1; + } + } + + __syncthreads(); + + int s = blockDim.x >> 1; + while(s > 0) { + if(tid < s) { + red_data[tid] += red_data[tid + s]; + } + + __syncthreads(); + s >>= 1; + } + + if(tid == 0) { + d_weights[blockIdx.x] = red_data[0]; + } +} + int cuda_compute_boundary_weights( real_t *position, int start, int end, int particle_capacity, - real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax); + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) { + + const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE; + int *h_weights = (int *) malloc(nblocks * sizeof(int)); + int *d_weights = (int *) device_alloc(nblocks * sizeof(int)); + int red = 0; + + CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int))); + + reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>( + position, start, end, particle_capacity, + xmin, xmax, ymin, ymax, zmin, zmax, d_weights); + + CUDA_ASSERT(cudaPeekAtLastError()); + CUDA_ASSERT(cudaDeviceSynchronize()); + CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost)); + + for(int i = 0; i < nblocks; i++) { + red += h_weights[i]; + } + + return red; +} #endif namespace pairs { diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu index e9de4c9..db5119e 100644 --- a/runtime/devices/cuda.cu +++ b/runtime/devices/cuda.cu @@ -1,11 +1,9 @@ #include <cuda_runtime.h> #include <iostream> #include <cstring> -#include "last_generated.hpp" #include "../pairs_common.hpp" #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); } -#define REDUCE_BLOCK_SIZE 64 namespace pairs { @@ -75,70 +73,4 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t } -void __global__ reduceBoundaryWeights( - real_t *position, int start, int end, int particle_capacity, - real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) { - - __shared__ int red_data[REDUCE_BLOCK_SIZE]; - int tid = threadIdx.x; - int i = blockIdx.x * blockDim.x + tid; - int particle_idx = start + i; - - red_data[tid] = 0; - - if(particle_idx < end) { - real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity); - real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity); - real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity); - - if( pos_x > xmin && pos_x <= xmax && - pos_y > ymin && pos_y <= ymax && - pos_z > zmin && pos_z <= zmax) { - red_data[tid] = 1; - } - } - - __syncthreads(); - - int s = blockDim.x >> 1; - while(s > 0) { - if(tid < s) { - red_data[tid] += red_data[tid + s]; - } - - __syncthreads(); - s >>= 1; - } - - if(tid == 0) { - d_weights[blockIdx.x] = red_data[0]; - } -} - -int cuda_compute_boundary_weights( - real_t *position, int start, int end, int particle_capacity, - real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) { - - const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE; - int *h_weights = (int *) malloc(nblocks * sizeof(int)); - int *d_weights = (int *) device_alloc(nblocks * sizeof(int)); - int red = 0; - - CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int))); - - reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>( - position, start, end, particle_capacity, - xmin, xmax, ymin, ymax, zmin, zmax, d_weights); - - CUDA_ASSERT(cudaPeekAtLastError()); - CUDA_ASSERT(cudaDeviceSynchronize()); - CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost)); - - for(int i = 0; i < nblocks; i++) { - red += h_weights[i]; - } - - return red; -} - } -- GitLab