Skip to content
Snippets Groups Projects
Commit 342f797c authored by Rafael Ravedutti's avatar Rafael Ravedutti
Browse files

Move CUDA routines back to boundary_weights.cpp

parent 149e9a25
Branches
No related tags found
No related merge requests found
......@@ -11,9 +11,74 @@
#include "last_generated.hpp"
#ifdef PAIRS_TARGET_CUDA
#define REDUCE_BLOCK_SIZE 64
void __global__ reduceBoundaryWeights(
real_t *position, int start, int end, int particle_capacity,
real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
__shared__ int red_data[REDUCE_BLOCK_SIZE];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + tid;
int particle_idx = start + i;
red_data[tid] = 0;
if(particle_idx < end) {
real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
if( pos_x > xmin && pos_x <= xmax &&
pos_y > ymin && pos_y <= ymax &&
pos_z > zmin && pos_z <= zmax) {
red_data[tid] = 1;
}
}
__syncthreads();
int s = blockDim.x >> 1;
while(s > 0) {
if(tid < s) {
red_data[tid] += red_data[tid + s];
}
__syncthreads();
s >>= 1;
}
if(tid == 0) {
d_weights[blockIdx.x] = red_data[0];
}
}
int cuda_compute_boundary_weights(
real_t *position, int start, int end, int particle_capacity,
real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax);
real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
int *h_weights = (int *) malloc(nblocks * sizeof(int));
int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
int red = 0;
CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
position, start, end, particle_capacity,
xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
CUDA_ASSERT(cudaPeekAtLastError());
CUDA_ASSERT(cudaDeviceSynchronize());
CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
for(int i = 0; i < nblocks; i++) {
red += h_weights[i];
}
return red;
}
#endif
namespace pairs {
......
#include <cuda_runtime.h>
#include <iostream>
#include <cstring>
#include "last_generated.hpp"
#include "../pairs_common.hpp"
#define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
#define REDUCE_BLOCK_SIZE 64
namespace pairs {
......@@ -75,70 +73,4 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
}
void __global__ reduceBoundaryWeights(
real_t *position, int start, int end, int particle_capacity,
real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
__shared__ int red_data[REDUCE_BLOCK_SIZE];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + tid;
int particle_idx = start + i;
red_data[tid] = 0;
if(particle_idx < end) {
real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
if( pos_x > xmin && pos_x <= xmax &&
pos_y > ymin && pos_y <= ymax &&
pos_z > zmin && pos_z <= zmax) {
red_data[tid] = 1;
}
}
__syncthreads();
int s = blockDim.x >> 1;
while(s > 0) {
if(tid < s) {
red_data[tid] += red_data[tid + s];
}
__syncthreads();
s >>= 1;
}
if(tid == 0) {
d_weights[blockIdx.x] = red_data[0];
}
}
int cuda_compute_boundary_weights(
real_t *position, int start, int end, int particle_capacity,
real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
int *h_weights = (int *) malloc(nblocks * sizeof(int));
int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
int red = 0;
CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
position, start, end, particle_capacity,
xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
CUDA_ASSERT(cudaPeekAtLastError());
CUDA_ASSERT(cudaDeviceSynchronize());
CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
for(int i = 0; i < nblocks; i++) {
red += h_weights[i];
}
return red;
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment