diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp index 6f625474a24d626889bd43ceb37aeff03552b359..f784d04fbbb378fc349c7d88f0fb9ea67d0c6e57 100644 --- a/runtime/boundary_weights.hpp +++ b/runtime/boundary_weights.hpp @@ -15,6 +15,12 @@ #pragma once +#ifdef PAIRS_TARGET_CUDA +int cuda_compute_boundary_weights( + real_t *position, int start, int end, int particle_capacity, + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax); +#endif + namespace pairs { void compute_boundary_weights( @@ -27,15 +33,16 @@ void compute_boundary_weights( const int nghost = ps->getNumberOfGhostParticles(); auto position_prop = ps->getPropertyByName("position"); - //ps->copyPropertyToDevice(position_prop, Ignore); + #ifndef PAIRS_TARGET_CUDA + auto position_ptr = position_prop->getHostPointer(); *comp_weight = 0; *comm_weight = 0; for(int i = 0; i < nlocal; i++) { - real_t pos_x = pairs_interface::get_position(position_ptr, i, 0, particle_capacity); - real_t pos_y = pairs_interface::get_position(position_ptr, i, 1, particle_capacity); - real_t pos_z = pairs_interface::get_position(position_ptr, i, 2, particle_capacity); + real_t pos_x = pairs_host_interface::get_position(position_ptr, i, 0, particle_capacity); + real_t pos_y = pairs_host_interface::get_position(position_ptr, i, 1, particle_capacity); + real_t pos_z = pairs_host_interface::get_position(position_ptr, i, 2, particle_capacity); if( pos_x > xmin && pos_x <= xmax && pos_y > ymin && pos_y <= ymax && @@ -45,9 +52,9 @@ void compute_boundary_weights( } for(int i = nlocal; i < nlocal + nghost; i++) { - real_t pos_x = pairs_interface::get_position(position_ptr, i, 0, particle_capacity); - real_t pos_y = pairs_interface::get_position(position_ptr, i, 1, particle_capacity); - real_t pos_z = pairs_interface::get_position(position_ptr, i, 2, particle_capacity); + real_t pos_x = pairs_host_interface::get_position(position_ptr, i, 0, particle_capacity); + real_t pos_y = pairs_host_interface::get_position(position_ptr, i, 1, particle_capacity); + real_t pos_z = pairs_host_interface::get_position(position_ptr, i, 2, particle_capacity); if( pos_x > xmin && pos_x <= xmax && pos_y > ymin && pos_y <= ymax && @@ -55,6 +62,17 @@ void compute_boundary_weights( *comm_weight++; } } + #else + auto position_ptr = position_prop->getDevicePointer(); + + ps->copyPropertyToDevice(position_prop, ReadOnly); + + *comp_weight = cuda_compute_boundary_weights( + position_ptr, 0, nlocal, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax); + + *comm_weight = cuda_compute_boundary_weights( + position_ptr, nlocal, nlocal + nghost, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax); + #endif } } diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu index 8bb7c59ef3bbefff0556847669faeae57744372e..0883728550bf5c764e760fa2a4b05e026e76dfab 100644 --- a/runtime/devices/cuda.cu +++ b/runtime/devices/cuda.cu @@ -3,6 +3,7 @@ #include <cstring> #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); } +#define REDUCE_BLOCK_SIZE 64 namespace pairs { @@ -71,4 +72,66 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count)); } +int cuda_compute_boundary_weights( + real_t *position, int start, int end, int particle_capacity, + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) { + + const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE; + int *h_weights = (int *) malloc(nblocks * sizeof(int)); + int *d_weights = (int *) device_alloc(nblocks * sizeof(int)); + int red = 0; + + CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int))); + + reduceBoundaryWeights( + position, start, particle_capacity, + ) + CUDA_ASSERT(cudaPeekAtLastError()); + CUDA_ASSERT(cudaDeviceSynchronize()); + CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost)); + + reduceBoundaryWeights<nblocks, REDUCE_BLOCK_SIZE>(); + + for(int i = 0; i < nblocks; i++) { + red += h_weights[i]; + } + + return red; +} + +void __device__ reduceBoundaryWeights() { + __shared__ int red_data[REDUCE_BLOCK_SIZE]; + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + real_t pos_x = pairs_cuda_interface::get_position(position, start + i, 0, particle_capacity); + real_t pos_y = pairs_cuda_interface::get_position(position, start + i, 1, particle_capacity); + real_t pos_z = pairs_cuda_interface::get_position(position, start + i, 2, particle_capacity); + + red_data[tid] = 0; + + if(i < n) { + if( pos_x > xmin && pos_x <= xmax && + pos_y > ymin && pos_y <= ymax && + pos_z > zmin && pos_z <= zmax) { + red_data[tid] = 1; + } + } + + __syncthreads(); + + int s = blockDim.x >> 1; + while(s > 0) { + if(tid < s) { + red_data[tid] += red_data[tid + s]; + } + + __syncthreads(); + s >>= 1; + } + + if(tid == 0) { + d_weights[blockIdx.x] = red_data[0]; + } +} + } diff --git a/runtime/domain/ParticleDataHanding.hpp b/runtime/domain/ParticleDataHanding.hpp index 879b9ae83a943c94f7482de0d5c3abb0d89d7907..b36cd0e0db416a8e38d828efd29dd5e6efb5d657 100644 --- a/runtime/domain/ParticleDataHanding.hpp +++ b/runtime/domain/ParticleDataHanding.hpp @@ -74,7 +74,6 @@ private: void serializeImpl(Block *const block, const BlockDataID&, mpi::SendBuffer& buffer, const uint_t child, bool check_child) { auto ptr = buffer.allocate<uint_t>(); double aabb_check[6]; - int nparticles; if(check_child) { const auto child_id = BlockID(block->getId(), child); @@ -95,28 +94,130 @@ private: aabb_check[5] = aabb.zMax(); } - nparticles = md_serialize_particles(aabb_check); + for(auto& p: ps->getNonVolatileProperties()) { + ps->copyPropertyToHost(p, ReadOnly); + } - for(int i = 0; i < nparticles * 7; ++i) { - buffer << md_get_send_buffer_value(i); + auto position = ps->getPropertyByName("position"); + int nlocal = ps->getNumberOfLocalParticles(); + int i = 0; + int nserialized = 0; + + while(i < nlocal) { + const real_t pos_x = position(i, 0); + const real_t pos_y = position(i, 1); + const real_t pos_z = position(i, 2); + + if( pos_x > aabb_check[0] && pos_x <= aabb_check[1] && + pos_y > aabb_check[2] && pos_y <= aabb_check[3] && + pos_z > aabb_check[4] && pos_z <= aabb_check[5]) { + + nlocal--; + + for(auto &p: ps->getNonVolatileProperties()) { + auto prop = ps->getProperty(p_id); + auto prop_type = prop.getType(); + + if(prop_type == Prop_Vector) { + auto vector_ptr = ps->getAsVectorProperty(prop); + constexpr int nelems = 3; + + for(int e = 0; e < nelems; e++) { + buffer << vector_ptr(i, e); + vector_ptr(i, e) = vector_ptr(nlocal, e); + } + } else if(prop_type == Prop_Matrix) { + auto matrix_ptr = ps->getAsMatrixProperty(prop); + constexpr int nelems = 9; + + for(int e = 0; e < nelems; e++) { + buffer << matrix_ptr(i, e); + matrix_ptr(i, e) = matrix_ptr(nlocal, e); + } + } else if(prop_type == Prop_Quaternion) { + auto quat_ptr = ps->getAsQuaternionProperty(prop); + constexpr int nelems = 4; + + for(int e = 0; e < nelems; e++) { + buffer << quat_ptr(i, e); + quat_ptr(i, e) = quat_ptr(nlocal, e); + } + } else if(prop_type == Prop_Integer) { + auto int_ptr = ps->getAsIntegerProperty(prop); + buffer << int_ptr(i); + int_ptr(i) = int_ptr(nlocal); + } else if(prop_type == Prop_Real) { + auto float_ptr = ps->getAsFloatProperty(prop); + buffer << float_ptr(i); + float_ptr(i) = float_ptr(nlocal); + } else { + std::cerr << "serializeImpl(): Invalid property type!" << std::endl; + return 0; + } + } + + // TODO: serialize contact history data as well + nserialized++; + } } - *ptr = (uint_t) nparticles; + ps->setNumberOfLocalParticles(nlocal); + *ptr = (uint_t) nserialized; } void deserializeImpl(IBlock *const, const BlockDataID&, mpi::RecvBuffer& buffer) { - uint_t nparticles; - buffer >> nparticles; - - md_resize_recv_buffer_capacity((int) nparticles); + int nlocal = ps->getNumberOfLocalParticles(); + uint_t nrecv; + + buffer >> nrecv; + + // TODO: Check if there is enough particle capacity for the new particles + // md_resize_recv_buffer_capacity((int) nparticles); + + for(int i = 0; i < nrecv; ++i) { + for(auto &p: ps->getNonVolatileProperties()) { + auto prop = ps->getProperty(p_id); + auto prop_type = prop.getType(); + + if(prop_type == Prop_Vector) { + auto vector_ptr = ps->getAsVectorProperty(prop); + constexpr int nelems = 3; + + for(int e = 0; e < nelems; e++) { + buffer >> vector_ptr(nlocal + i, e); + } + } else if(prop_type == Prop_Matrix) { + auto matrix_ptr = ps->getAsMatrixProperty(prop); + constexpr int nelems = 9; + + for(int e = 0; e < nelems; e++) { + buffer >> matrix_ptr(nlocal + i, e); + } + } else if(prop_type == Prop_Quaternion) { + auto quat_ptr = ps->getAsQuaternionProperty(prop); + constexpr int nelems = 4; + + for(int e = 0; e < nelems; e++) { + buffer >> quat_ptr(nlocal + i, e); + } + } else if(prop_type == Prop_Integer) { + auto int_ptr = ps->getAsIntegerProperty(prop); + buffer >> int_ptr(nlocal + i); + } else if(prop_type == Prop_Real) { + auto float_ptr = ps->getAsFloatProperty(prop); + buffer >> float_ptr(nlocal + i); + } else { + std::cerr << "deserializeImpl(): Invalid property type!" << std::endl; + return 0; + } + } + } - for(int i = 0; i < (int) nparticles * 7; ++i) { - double v; - buffer >> v; - md_set_recv_buffer_value(i, v); + for(auto& p: ps->getNonVolatileProperties()) { + ps->clearDeviceFlags(p); } - md_deserialize_particles((int) nparticles); + ps->setNumberOfLocalParticles(nlocal + nrecv); } }; diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp index 44d30197437bef211a450f6e6ebfc59d0ff0e04d..02a8fca8204d551e912c4112f99deb2ed71f724c 100644 --- a/runtime/domain/block_forest.cpp +++ b/runtime/domain/block_forest.cpp @@ -77,8 +77,6 @@ void BlockForest::updateNeighborhood() { } } - - void BlockForest::copyRuntimeArray(const std::string& name, void *dest, const int size) { void *src = name.compare('ranks') ? ranks.data() : name.compare('naabbs') ? vec_naabbs.data() : @@ -265,7 +263,8 @@ void BlockForest::initializeWorkloadBalancer() { forest->allowMultipleRefreshCycles(false); forest->checkForEarlyOutInRefresh(false); forest->checkForLateOutInRefresh(false); - forest->setRefreshMinTargetLevelDeterminationFunction(pe::amr::MinMaxLevelDetermination(info, regridMin, regridMax)); + forest->setRefreshMinTargetLevelDeterminationFunction( + pe::amr::MinMaxLevelDetermination(info, regridMin, regridMax)); for_each(algorithm.begin(), algorithm.end(), [](char& c) { c = (char) ::tolower(c); });