From 342f797c54b9b8cbf78f1db00284c14a240e4a05 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti <rafaelravedutti@gmail.com>
Date: Tue, 24 Sep 2024 15:47:18 +0200
Subject: [PATCH] Move CUDA routines back to boundary_weights.cpp

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
---
 runtime/boundary_weights.cpp | 67 ++++++++++++++++++++++++++++++++++-
 runtime/devices/cuda.cu      | 68 ------------------------------------
 2 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
index 35880c0..065c4cb 100644
--- a/runtime/boundary_weights.cpp
+++ b/runtime/boundary_weights.cpp
@@ -11,9 +11,74 @@
 #include "last_generated.hpp"
 
 #ifdef PAIRS_TARGET_CUDA
+
+#define REDUCE_BLOCK_SIZE   64
+
+void __global__ reduceBoundaryWeights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
+
+    __shared__ int red_data[REDUCE_BLOCK_SIZE];
+    int tid = threadIdx.x;
+    int i = blockIdx.x * blockDim.x + tid;
+    int particle_idx = start + i;
+
+    red_data[tid] = 0;
+
+    if(particle_idx < end) {
+        real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
+        real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
+        real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
+
+        if( pos_x > xmin && pos_x <= xmax &&
+            pos_y > ymin && pos_y <= ymax &&
+            pos_z > zmin && pos_z <= zmax) {
+                red_data[tid] = 1;
+        }
+    }
+
+    __syncthreads();
+
+    int s = blockDim.x >> 1;
+    while(s > 0) {
+        if(tid < s) {
+            red_data[tid] += red_data[tid + s];
+        }
+
+        __syncthreads();
+        s >>= 1;
+    }
+
+    if(tid == 0) {
+        d_weights[blockIdx.x] = red_data[0];
+    }
+}
+
 int cuda_compute_boundary_weights(
     real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax);
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+
+    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
+            position, start, end, particle_capacity,
+            xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
+
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+
+    return red;
+}
 #endif
 
 namespace pairs {
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index e9de4c9..db5119e 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -1,11 +1,9 @@
 #include <cuda_runtime.h>
 #include <iostream>
 #include <cstring>
-#include "last_generated.hpp"
 #include "../pairs_common.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
-#define REDUCE_BLOCK_SIZE   64
 
 namespace pairs {
 
@@ -75,70 +73,4 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
 }
 
 
-void __global__ reduceBoundaryWeights(
-    real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
-
-    __shared__ int red_data[REDUCE_BLOCK_SIZE];
-    int tid = threadIdx.x;
-    int i = blockIdx.x * blockDim.x + tid;
-    int particle_idx = start + i;
-
-    red_data[tid] = 0;
-
-    if(particle_idx < end) {
-        real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
-        real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
-        real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
-
-        if( pos_x > xmin && pos_x <= xmax &&
-            pos_y > ymin && pos_y <= ymax &&
-            pos_z > zmin && pos_z <= zmax) {
-                red_data[tid] = 1;
-        }
-    }
-
-    __syncthreads();
-
-    int s = blockDim.x >> 1;
-    while(s > 0) {
-        if(tid < s) {
-            red_data[tid] += red_data[tid + s];
-        }
-
-        __syncthreads();
-        s >>= 1;
-    }
-
-    if(tid == 0) {
-        d_weights[blockIdx.x] = red_data[0];
-    }
-}
-
-int cuda_compute_boundary_weights(
-    real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
-
-    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
-    int *h_weights = (int *) malloc(nblocks * sizeof(int));
-    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
-    int red = 0;
-
-    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
-
-    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
-            position, start, end, particle_capacity, 
-            xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
-
-    CUDA_ASSERT(cudaPeekAtLastError());
-    CUDA_ASSERT(cudaDeviceSynchronize());
-    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
-
-    for(int i = 0; i < nblocks; i++) {
-        red += h_weights[i];
-    }
-
-    return red;
-}
-
 }
-- 
GitLab