diff --git a/CMakeLists.txt b/CMakeLists.txt index e5e64f58adc9305ca50a519910012e440bf6a805..b7fd6a678cd6af86a50c3e3daf7fa029936af84e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON) option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF) if(USE_WALBERLA) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DUSE_WALBERLA") find_package(waLBerla REQUIRED) add_subdirectory(${walberla_SOURCE_DIR} ${walberla_BINARY_DIR} EXCLUDE_FROM_ALL) waLBerla_import() @@ -39,7 +39,6 @@ set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces) set(RUNTIME_COMMON_FILES runtime/pairs.cpp - # runtime/boundary_weights.cpp # avoid compiling this for now. TODO: generate the host and device functions runtime/copper_fcc_lattice.cpp runtime/create_body.cpp runtime/dem_sc_grid.cpp @@ -49,8 +48,9 @@ set(RUNTIME_COMMON_FILES runtime/timing.cpp runtime/vtk.cpp runtime/domain/regular_6d_stencil.cpp) - + set(RUNTIME_WALBERLA_FILES + # runtime/boundary_weights.cpp # avoid compiling this for now. TODO: generate the host and device functions runtime/domain/block_forest.cpp) set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE STRING "Python executable not set.") diff --git a/examples/dem_sd.py b/examples/dem_sd.py index 5bea2fd80c594e2da27cf99393804b9d5cd40c86..d1a78d946e990dec7f4e55d3b8878efef3b65077 100644 --- a/examples/dem_sd.py +++ b/examples/dem_sd.py @@ -115,8 +115,8 @@ psim.add_property('linear_velocity', pairs.vector()) psim.add_property('angular_velocity', pairs.vector()) psim.add_property('force', pairs.vector(), volatile=True) psim.add_property('torque', pairs.vector(), volatile=True) -# psim.add_property('hydrodynamic_force', pairs.vector()) -# psim.add_property('hydrodynamic_torque', pairs.vector()) +psim.add_property('hydrodynamic_force', pairs.vector(), reduce=True) +psim.add_property('hydrodynamic_torque', pairs.vector(), reduce=True) # psim.add_property('old_hydrodynamic_force', pairs.vector()) # psim.add_property('old_hydrodynamic_torque', pairs.vector()) psim.add_property('radius', pairs.real(), 1.0) @@ -130,12 +130,12 @@ psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDyn psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]]) # psim.set_domain_partitioner(pairs.block_forest(), initDomainFromWalberla=True) -# psim.set_domain_partitioner(pairs.block_forest()) -psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy()) +psim.set_domain_partitioner(pairs.block_forest()) +# psim.set_domain_partitioner(pairs.regular_domain_partitioner()) psim.pbc([False, False, False]) -psim.dem_sc_grid( - domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI, - diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes) +# psim.dem_sc_grid( +# domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI, +# diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes) #psim.read_particle_data( # "data/spheres.input", diff --git a/examples/main.cpp b/examples/main.cpp index a663ba8157b13cfb42610a4cdf3c8db59570ac30..5f8473b5e5c7fee202399084ef44b92ec3851c8a 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -1,41 +1,32 @@ #include <iostream> //--- #include "dem_sd.hpp" -// #include "core/mpi/Broadcast.h" -// #include <blockforest/BlockForest.h> -// #include <blockforest/Initialization.h> -// #include <blockforest/loadbalancing/DynamicCurve.h> -// #include <blockforest/loadbalancing/DynamicDiffusive.h> -// #include <blockforest/loadbalancing/DynamicParMetis.h> -// #include <blockforest/loadbalancing/InfoCollection.h> -// #include <blockforest/loadbalancing/PODPhantomData.h> -// #include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h> -// #include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h> -// #include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h> +#include <blockforest/BlockForest.h> +#include <blockforest/Initialization.h> int main(int argc, char **argv) { auto pairs_sim = std::make_shared<PairsSimulation>(); - // auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim); + auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim); // Create forest (make sure to use_domain(forest)) ---------------------------------------------- - // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1); - // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance(); - // mpiManager->initializeMPI(&argc, &argv); - // mpiManager->useWorldComm(); - // auto rank = mpiManager->rank(); - // auto procs = mpiManager->numProcesses(); - // auto block_config = walberla::Vector3<int>(2, 2, 1); - // auto ref_level = 0; - // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest( - // domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level); + walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1); + std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance(); + mpiManager->initializeMPI(&argc, &argv); + mpiManager->useWorldComm(); + auto rank = mpiManager->rank(); + auto procs = mpiManager->numProcesses(); + auto block_config = walberla::Vector3<int>(2, 2, 2); + auto ref_level = 0; + std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest( + domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level); //----------------------------------------------------------------------------------------------- // initialize pairs data structures ---------------------------------------------- pairs_sim->initialize(); // either create new domain or use an existing one ---------------------------------------- - pairs_sim->create_domain(argc, argv); - // pairs_sim->use_domain(forest); + // pairs_sim->create_domain(argc, argv); + pairs_sim->use_domain(forest); // create planes and particles ------------------------------------------------------------ pairs_sim->create_halfspace(0, 0, 0, 1, 0, 0, 0, 13); @@ -45,21 +36,23 @@ int main(int argc, char **argv) { pairs_sim->create_halfspace(0.1, 0.1, 0.1, 0, -1, 0, 0, 13); pairs_sim->create_halfspace(0.1, 0.1, 0.1, 0, 0, -1, 0, 13); - pairs::id_t pUid = pairs_sim->create_sphere(0.03, 0.03, 0.08, 0.5, 0.5, 0 , 1000, 0.0045, 0, 0); - pairs_sim->create_sphere(0.07, 0.07, 0.08, -0.5, -0.5, 0 , 1000, 0.0045, 0, 0); + pairs::id_t pUid = pairs_sim->create_sphere(0.0499, 0.0499, 0.07, 0.5, 0.5, 0 , 1000, 0.0045, 0, 0); + pairs::id_t pUid2 = pairs_sim->create_sphere(0.0499, 0.0499, 0.0499, 0.5, 0.5, 0 , 1000, 0.0045, 0, 0); // Tracking a particle ------------------------------------------------------------------------ // if (pUid != pairs_acc->getInvalidUid()){ // std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl; // } - // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM); + walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM); + walberla::mpi::allReduceInplace(pUid2, walberla::mpi::SUM); // if (pUid != pairs_acc->getInvalidUid()){ // std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl; // } - // auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();}; + auto pIsLocalInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();}; + auto pIsGhostInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdxGhost(uid) != pairs_acc->getInvalidIdx();}; // TODO: make sure linkedCellWidth is larger than max diameter in the system @@ -75,19 +68,59 @@ int main(int argc, char **argv) { // } // } - // if (pIsInMyRank(pUid)){ + // if (pIsLocalInMyRank(pUid)){ // int idx = pairs_acc->uidToIdx(pUid); // pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08)); // pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5)); // } - for (int i=0; i<10000; ++i){ - // if ((i%200==0) && (pIsInMyRank(pUid))){ + for (int t=0; t<1; ++t){ + // if ((t%200==0) && (pIsLocalInMyRank(pUid))){ // int idx = pairs_acc->uidToIdx(pUid); // std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl; // } - - pairs_sim->do_timestep(i); + + if (pIsLocalInMyRank(pUid)){ + int idx = pairs_acc->uidToIdx(pUid); + pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1)); + pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(2, 2, 2)); + } + if (pIsLocalInMyRank(pUid2)){ + int idx = pairs_acc->uidToIdx(pUid2); + pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(10, 10, 10)); + pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(20, 20, 20)); + } + + pairs_sim->do_timestep(t); + + if (pIsGhostInMyRank(pUid)){ + int idx = pairs_acc->uidToIdxGhost(pUid); + pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1)); + pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(3, 3, 3)); + } + if (pIsGhostInMyRank(pUid2)){ + int idx = pairs_acc->uidToIdxGhost(pUid2); + pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(2, 2, 2)); + pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(4, 4, 4)); + } + + std::cout << "reverse comm and reduce" << std::endl; + pairs_sim->reverse_comm(); + + if (pIsLocalInMyRank(pUid)){ + int idx = pairs_acc->uidToIdx(pUid); + auto forceSum = pairs_acc->getHydrodynamicForce(idx); + auto torqueSum = pairs_acc->getHydrodynamicTorque(idx); + std::cout << pUid << " @@@@ reduced force = " << forceSum << std::endl; + std::cout << pUid << " @@@@ reduced torque = " << torqueSum << std::endl; + } + if (pIsLocalInMyRank(pUid2)){ + int idx = pairs_acc->uidToIdx(pUid2); + auto forceSum = pairs_acc->getHydrodynamicForce(idx); + auto torqueSum = pairs_acc->getHydrodynamicTorque(idx); + std::cout << pUid2 << " @@@@ reduced force = " << forceSum << std::endl; + std::cout << pUid2 << " @@@@ reduced torque = " << torqueSum << std::endl; + } } pairs_sim->end(); diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp index f2c84f8a10869b87e99794a0eaca61b6283c05b7..2174da809af1e7f83cf3fe47adc4d2280494ca7e 100644 --- a/runtime/boundary_weights.cpp +++ b/runtime/boundary_weights.cpp @@ -8,13 +8,13 @@ #include "pairs_common.hpp" // Always include last generated interfaces -#include "last_generated.hpp" +#include "interfaces/last_generated.hpp" #ifdef PAIRS_TARGET_CUDA #define REDUCE_BLOCK_SIZE 64 -void __global__ reduceBoundaryWeights( +__global__ void reduceBoundaryWeights( real_t *position, int start, int end, int particle_capacity, real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) { diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp index cd079d139052f82a98315e9ccfdfdd8d19130ab4..bb310add32dd93ddb892c1838b6e136ae25123f9 100644 --- a/runtime/boundary_weights.hpp +++ b/runtime/boundary_weights.hpp @@ -15,9 +15,11 @@ namespace pairs { void compute_boundary_weights( PairsRuntime *ps, real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, - long unsigned int *comp_weight, long unsigned int *comm_weight){ + long unsigned int *comp_weight, long unsigned int *comm_weight) + { std::cerr<< "TODO: boundary weights should be generated" << std::endl; exit(-1); - }; + } + ; } diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp index e9cc12d2863208d2f3146f3b129baafcf5821e8a..f66bb70434d793e0dee825436e0f4d3096193d2b 100644 --- a/runtime/domain/ParticleDataHandling.hpp +++ b/runtime/domain/ParticleDataHandling.hpp @@ -191,7 +191,7 @@ public: // TODO: Check if there is enough particle capacity for the new particles, when there is not, // all properties and arrays which have particle_capacity as one of their dimensions must be reallocated - PAIRS_ASSERT(nlocal + nrecv < particle_capacity); + // PAIRS_ASSERT(nlocal + nrecv < particle_capacity); for(int i = 0; i < nrecv; ++i) { for(auto &prop: ps->getProperties()) { diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp index fcb19886f2f253027501fd5c621be4019401804d..fcc532917744dbaf474de43b9b65acd018b4c762 100644 --- a/runtime/domain/block_forest.cpp +++ b/runtime/domain/block_forest.cpp @@ -68,7 +68,8 @@ void BlockForest::updateNeighborhood() { auto block = static_cast<walberla::blockforest::Block *>(&iblock); auto& block_info = (*info)[block->getId()]; - if(block_info.computationalWeight > 0) { + // don't check computationalWeight for now (TODO: compute_boundary_weights) + // if(block_info.computationalWeight > 0) { for(uint neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) { auto neighbor_rank = walberla::int_c(block->getNeighborProcess(neigh)); @@ -79,8 +80,8 @@ void BlockForest::updateNeighborhood() { auto begin = blocks_pushed[neighbor_rank].begin(); auto end = blocks_pushed[neighbor_rank].end(); - if(neighbor_info.computationalWeight > 0 && - find_if(begin, end, [neighbor_block](const auto &nbh) { + // if(neighbor_info.computationalWeight > 0 && + if( find_if(begin, end, [neighbor_block](const auto &nbh) { return nbh == neighbor_block; }) == end) { neighborhood[neighbor_rank].push_back(neighbor_aabb); @@ -88,7 +89,7 @@ void BlockForest::updateNeighborhood() { } // } } - } + // } } for(auto& nbh: neighborhood) { @@ -133,21 +134,21 @@ void BlockForest::updateWeights() { auto block = static_cast<walberla::blockforest::Block *>(&iblock); auto aabb = block->getAABB(); auto& block_info = (*info)[block->getId()]; - - pairs::compute_boundary_weights( - this->ps, - aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(), - &(block_info.computationalWeight), &(block_info.communicationWeight)); + // TODO: Generate boundary weights + // pairs::compute_boundary_weights( + // this->ps, + // aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(), + // &(block_info.computationalWeight), &(block_info.communicationWeight)); for(int branch = 0; branch < 8; ++branch) { const auto b_id = walberla::BlockID(block->getId(), branch); const auto b_aabb = forest->getAABBFromBlockId(b_id); auto& b_info = (*info)[b_id]; - pairs::compute_boundary_weights( - this->ps, - b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(), - &(b_info.computationalWeight), &(b_info.communicationWeight)); + // pairs::compute_boundary_weights( + // this->ps, + // b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(), + // &(b_info.computationalWeight), &(b_info.communicationWeight)); } } @@ -270,6 +271,9 @@ void BlockForest::initialize(int *argc, char ***argv) { auto procs = mpiManager->numProcesses(); auto block_config = balance_workload ? walberla::Vector3<int>(1, 1, 1) : getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]); + + if(rank==0) std::cout << "block_config = " << block_config << std::endl; + auto ref_level = balance_workload ? getInitialRefinementLevel(procs) : 0; forest = walberla::blockforest::createBlockForest( @@ -452,6 +456,14 @@ void BlockForest::communicateData( } } +void BlockForest::communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv) { + + this->communicateData(dim, elem_size,send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv); +} + void BlockForest::communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp index d607616995f4af4b41f3840b1e86fbb6a7294367..1c7bf537901f0681b2ddab336f39dafda1abd61a 100644 --- a/runtime/domain/block_forest.hpp +++ b/runtime/domain/block_forest.hpp @@ -79,6 +79,11 @@ public: const real_t *send_buf, const int *send_offsets, const int *nsend, real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/runtime/domain/domain_partitioning.hpp b/runtime/domain/domain_partitioning.hpp index 77af9b8569e8dafbc0015974ab3fb6ec0383d635..15c56f3a509e784fa88bf2c9bc45abe926fd5916 100644 --- a/runtime/domain/domain_partitioning.hpp +++ b/runtime/domain/domain_partitioning.hpp @@ -50,10 +50,17 @@ public: int dim, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0; + + virtual void communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0; + virtual void communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0; + virtual void finalize() = 0; }; diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp index 649e18624899655dc5730d27e51dc5cc15d80918..6da8b4cb18518b4deff9fccf89308d65f84c85d5 100644 --- a/runtime/domain/regular_6d_stencil.cpp +++ b/runtime/domain/regular_6d_stencil.cpp @@ -167,6 +167,35 @@ void Regular6DStencil::communicateData( } } +void Regular6DStencil::communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv) { + + const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size]; + const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size]; + real_t *recv_prev = &recv_buf[recv_offsets[dim * 2 + 0] * elem_size]; + real_t *recv_next = &recv_buf[recv_offsets[dim * 2 + 1] * elem_size]; + + if(prev[dim] != rank) { + MPI_Sendrecv( + send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0, + recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t)); + } + + if(next[dim] != rank) { + MPI_Sendrecv( + send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0, + recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t)); + } +} + void Regular6DStencil::communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/runtime/domain/regular_6d_stencil.hpp b/runtime/domain/regular_6d_stencil.hpp index 5b91d7a53c73f75edb4671deb9dba9069b68a9c6..7ed18677451710577fb49421fdcaaf83507e8ff7 100644 --- a/runtime/domain/regular_6d_stencil.hpp +++ b/runtime/domain/regular_6d_stencil.hpp @@ -67,6 +67,11 @@ public: const real_t *send_buf, const int *send_offsets, const int *nsend, real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp index a0b767c4322f84bea978243c3febe3440dbba344..40c1381bdab04b269345b5d126040420aaed9653 100644 --- a/runtime/pairs.cpp +++ b/runtime/pairs.cpp @@ -43,7 +43,7 @@ void PairsRuntime::initDomain( #endif else { - PAIRS_ERROR("Domain partitioning type not implemented!\n"); + PAIRS_ERROR("(initDomain) Domain partitioning type not implemented!\n"); exit(-1); } @@ -426,7 +426,7 @@ void PairsRuntime::communicateData( nrecv_all += nrecv[n]; } } - + copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t)); array_flags->setHostFlag(recv_buf_id); array_flags->clearDeviceFlag(recv_buf_id); @@ -446,6 +446,69 @@ void PairsRuntime::communicateData( #endif } +void PairsRuntime::communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv) { + + const real_t *send_buf_ptr = send_buf; + real_t *recv_buf_ptr = recv_buf; + auto send_buf_array = getArrayByHostPointer(send_buf); + auto recv_buf_array = getArrayByHostPointer(recv_buf); + auto send_buf_id = send_buf_array.getId(); + auto recv_buf_id = recv_buf_array.getId(); + auto send_offsets_id = getArrayByHostPointer(send_offsets).getId(); + auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId(); + auto nsend_id = getArrayByHostPointer(nsend).getId(); + auto nrecv_id = getArrayByHostPointer(nrecv).getId(); + + this->getTimers()->start(DeviceTransfers); + copyArrayToHost(send_offsets_id, ReadOnly); + copyArrayToHost(recv_offsets_id, ReadOnly); + copyArrayToHost(nsend_id, ReadOnly); + copyArrayToHost(nrecv_id, ReadOnly); + + #ifdef ENABLE_CUDA_AWARE_MPI + send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); + recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); + #else + int nsend_all = 0; + int nrecv_all = 0; + if(this->dom_part_type == RegularPartitioning || this->dom_part_type == RegularXYPartitioning){ + for(int d = 2; d >= dim; d--) { + nsend_all += nsend[d * 2 + 0]; + nsend_all += nsend[d * 2 + 1]; + nrecv_all += nrecv[d * 2 + 0]; + nrecv_all += nrecv[d * 2 + 1]; + } + } + else if (this->dom_part_type == BlockForestPartitioning){ + int nranks = this->getDomainPartitioner()->getNumberOfNeighborRanks(); + for (int n=0; n<nranks; ++n){ // blockforest doesn't need reverse loop + nsend_all += nsend[n]; + nrecv_all += nrecv[n]; + } + } + + copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t)); + array_flags->setHostFlag(recv_buf_id); + array_flags->clearDeviceFlag(recv_buf_id); + #endif + + this->getTimers()->stop(DeviceTransfers); + + this->getTimers()->start(Communication); + this->getDomainPartitioner()->communicateDataReverse( + dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv); + this->getTimers()->stop(Communication); + + #ifndef ENABLE_CUDA_AWARE_MPI + this->getTimers()->start(DeviceTransfers); + copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t)); + this->getTimers()->stop(DeviceTransfers); + #endif +} + void PairsRuntime::communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp index 68fb2d68a3cb77a8548b17517442fad7b62ab970..56d4022e471dbb40733f88a1fbf5065c0c0cce16 100644 --- a/runtime/pairs.hpp +++ b/runtime/pairs.hpp @@ -322,6 +322,11 @@ public: const real_t *send_buf, const int *send_offsets, const int *nsend, real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateDataReverse( + int dim, int elem_size, + const real_t *send_buf, const int *send_offsets, const int *nsend, + real_t *recv_buf, const int *recv_offsets, const int *nrecv); + void communicateAllData( int ndims, int elem_size, const real_t *send_buf, const int *send_offsets, const int *nsend, diff --git a/src/pairs/analysis/devices.py b/src/pairs/analysis/devices.py index d4552eba7183ab9a5371cb69e6bc6e07b22ed592..5a15a13ea57ba34ce0f019ec0f202e040e5bd396 100644 --- a/src/pairs/analysis/devices.py +++ b/src/pairs/analysis/devices.py @@ -27,7 +27,7 @@ class MarkCandidateLoops(Visitor): if isinstance(branch_stmt, For): possible_candidates.append(branch_stmt) - if isinstance(stmt, For): + if isinstance(stmt, For) and not stmt.not_kernel: possible_candidates.append(stmt) for stmt in possible_candidates: diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py index 8af9bbadca2c22537df913a3fcee31c771f7d8ba..9eef2d33dd7b6702075db8f619733505eb928857 100644 --- a/src/pairs/code_gen/cgen.py +++ b/src/pairs/code_gen/cgen.py @@ -172,7 +172,8 @@ class CGen: self.print("// Module headers") for module in self.sim.modules(): - self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);") + if module.name != "main": + self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);") self.print("") @@ -188,7 +189,13 @@ class CGen: self.print.add_indent(4) self.print("PairsAccessor(const std::shared_ptr<PairsSimulation> &ps_): ps(ps_){}") self.print("") - self.print("int size() const { return ps->pobj->nlocal; }") + self.print("int size() const { return ps->pobj->nlocal + ps->pobj->nghost; }") + self.print("") + + self.print("int nlocal() const { return ps->pobj->nlocal; }") + self.print("") + + self.print("int nghost() const { return ps->pobj->nghost; }") self.print("") self.print("int getInvalidIdx(){return -1;}") @@ -199,7 +206,19 @@ class CGen: self.print('''int uidToIdx(pairs::id_t uid){ int idx = getInvalidIdx(); - for(int i=0; i<ps->pobj->nlocal; ++i){ + for(int i=0; i<this->nlocal(); ++i){ + if (getUid(i) == uid){ + idx = i; + break; + } + } + return idx;''') + self.print("}") + self.print("") + + self.print('''int uidToIdxGhost(pairs::id_t uid){ + int idx = getInvalidIdx(); + for(int i=this->nlocal(); i<this->size(); ++i){ if (getUid(i) == uid){ idx = i; break; @@ -337,6 +356,7 @@ class CGen: self.print.start() self.generate_preamble() self.generate_pairs_object_structure() + self.generate_module_headers() for kernel in self.sim.kernels(): self.generate_kernel(kernel) @@ -346,7 +366,7 @@ class CGen: self.print.end() - def generate_library(self, initialize_module, create_domain_module, setup_sim_module, do_timestep_module): + def generate_library(self, initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module): self.generate_interfaces() # Generate CUDA/CPP file with modules ext = ".cu" if self.target.is_gpu() else ".cpp" @@ -374,7 +394,7 @@ class CGen: self.generate_kernel(kernel) for module in self.sim.modules(): - if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep']: + if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep', 'reverse_comm']: self.generate_module(module) self.print.end() @@ -436,6 +456,11 @@ class CGen: self.print("}") self.print("") + self.print("int rank(){") + self.print(" return pairs_runtime->getDomainPartitioner()->getRank();") + self.print("}") + self.print("") + self.print("void setup_sim() {") self.print.add_indent(4) self.generate_statement(setup_sim_module.block) @@ -451,6 +476,13 @@ class CGen: self.print("}") self.print("") + self.print("void reverse_comm() {") + self.print.add_indent(4) + self.generate_statement(reverse_comm_module.block) + self.print.add_indent(-4) + self.print("}") + self.print("") + self.print("void end() {") self.print(" pairs::print_timers(pairs_runtime);") self.print(" pairs::print_stats(pairs_runtime, pobj->nlocal, pobj->nghost);") @@ -461,7 +493,7 @@ class CGen: self.print.add_indent(-4) self.print("};") - # self.generate_host_pairs_accessor_class() + self.generate_host_pairs_accessor_class() self.print.end() self.generate_full_object_names = False @@ -512,17 +544,16 @@ class CGen: if device_cond and var.device_flag: self.print(f"{type_kw} *{var.name()} = pobj->rv_{var.name()}.getDevicePointer();") + elif var.force_read: + self.print(f"{type_kw} {var.name()} = pobj->{var.name()};") else: self.print(f"{type_kw} *{var.name()} = &(pobj->{var.name()});") for array in module.arrays(): type_kw = Types.c_keyword(self.sim, array.type()) name = array.name() if not device_cond else f"d_{array.name()}" - # self.generate_full_object_names = True if not array.is_static() or (array.is_static() and not device_cond): self.print(f"{type_kw} *{array.name()} = pobj->{name};") - # self.print(f"{type_kw} *{array.name()} = {self.generate_object_reference(array, device=device_cond)};") - # self.generate_full_object_names = False if array in module.host_references(): self.print(f"{type_kw} *h_{array.name()} = pobj->{array.name()};") @@ -615,13 +646,13 @@ class CGen: self.print.add_indent(4) self.kernel_context = True - if has_resizes: - self.print(f"printf(\"{kernel.name} @@@@@@@@ before kernel: resizes[0] = %d\\n\", resizes[0]);") + # if has_resizes: + # self.print(f"printf(\"{kernel.name} @@@@@@@@ before kernel: resizes[0] = %d\\n\", resizes[0]);") self.generate_statement(kernel.block) - if has_resizes: - self.print(f"printf(\"{kernel.name} @@@@@@@@ after kernel: resizes[0] = %d\\n\", resizes[0]);") + # if has_resizes: + # self.print(f"printf(\"{kernel.name} @@@@@@@@ after kernel: resizes[0] = %d\\n\", resizes[0]);") self.kernel_context = False self.print.add_indent(-4) @@ -663,9 +694,9 @@ class CGen: if ast_node.check_for_resize(): resize = self.generate_expression(ast_node.resize) capacity = self.generate_expression(ast_node.capacity) - self.print(f"printf (\" %d -- before AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});") + # self.print(f"printf (\" %d -- before AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});") self.print(f"pairs::{prefix}atomic_add_resize_check(&({elem}), {value}, &({resize}), {capacity});") - self.print(f"printf (\" %d -- after AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});") + # self.print(f"printf (\" %d -- after AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});") else: self.print(f"pairs::{prefix}atomic_add(&({elem}), {value});") @@ -866,9 +897,9 @@ class CGen: self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}, {size}); // {array_name}") else: - self.print(f"std::cout<< \"{Printer.line_id} -- before {array_name} copyArrayTo{ctx_suffix}({action}) === \" << pobj->{array_name}[0] << \" \" << pobj->{array_name}[1] << \" \" << pobj->{array_name}[2] << std::endl;") + # self.print(f"std::cout<< \"{Printer.line_id} -- before {array_name} copyArrayTo{ctx_suffix}({action}) === \" << pobj->{array_name}[0] << \" \" << pobj->{array_name}[1] << \" \" << pobj->{array_name}[2] << std::endl;") self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}); // {array_name}") - self.print(f"std::cout<< \"{Printer.line_id} -- after {array_name} copyArrayTo{ctx_suffix}({action}) === \" << pobj->{array_name}[0] << \" \" << pobj->{array_name}[1] << \" \" << pobj->{array_name}[2] << std::endl;") + # self.print(f"std::cout<< \"{Printer.line_id} -- after {array_name} copyArrayTo{ctx_suffix}({action}) === \" << pobj->{array_name}[0] << \" \" << pobj->{array_name}[1] << \" \" << pobj->{array_name}[2] << std::endl;") if isinstance(ast_node, CopyContactProperty): prop_id = ast_node.contact_prop().id() @@ -1151,7 +1182,7 @@ class CGen: var = self.generate_expression(ast_node.var) # Dereferences are ignored for write variables when full objects # are generated since they can be directly written into - return var if self.generate_full_object_names else f"(*{var})" + return var if (self.generate_full_object_names or ast_node.var.force_read) else f"(*{var})" if isinstance(ast_node, DeviceStaticRef): elem = self.generate_expression(ast_node.elem) diff --git a/src/pairs/ir/loops.py b/src/pairs/ir/loops.py index 997fda5c6fa73938a74c8fb52f453caa1bd60a21..228bd40e9a69edd70ef36b33dacf82a8aba34fb0 100644 --- a/src/pairs/ir/loops.py +++ b/src/pairs/ir/loops.py @@ -39,7 +39,7 @@ class Iter(ASTTerm): class For(ASTNode): - def __init__(self, sim, range_min, range_max, block=None): + def __init__(self, sim, range_min, range_max, block=None, not_kernel=False): super().__init__(sim) self.iterator = Iter(sim, self) self.min = Lit.cvt(sim, range_min) @@ -47,6 +47,7 @@ class For(ASTNode): self.block = Block(sim, []) if block is None else block self.kernel = None self._kernel_candidate = False + self.not_kernel = not_kernel def __str__(self): return f"For<{self.iterator}, {self.min} ... {self.max}>" diff --git a/src/pairs/ir/properties.py b/src/pairs/ir/properties.py index b13d6f48e8face187f328130a95a14eb1ff0717d..21ed08c918529f509047c0075a2670a5922af5b0 100644 --- a/src/pairs/ir/properties.py +++ b/src/pairs/ir/properties.py @@ -16,8 +16,8 @@ class Properties: self.props = [] self.defs = {} - def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS): - p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout) + def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS, p_reduce=False): + p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout, p_reduce) self.props.append(p) self.defs[p_name] = p_value return p @@ -27,6 +27,9 @@ class Properties: def all(self): return self.props + + def reduction_props(self): + return [p for p in self.props if p.reduce is True] def volatiles(self): return [p for p in self.props if p.volatile is True] @@ -51,7 +54,7 @@ class Properties: class Property(ASTNode): last_prop_id = 0 - def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS): + def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS, reduce=False): super().__init__(sim) self.prop_id = Property.last_prop_id self.prop_name = name @@ -59,6 +62,7 @@ class Property(ASTNode): self.prop_layout = layout self.default_value = default self.volatile = volatile + self.reduce = reduce self.device_flag = False Property.last_prop_id += 1 diff --git a/src/pairs/ir/variables.py b/src/pairs/ir/variables.py index 9d36cf6a531c7e8fcf5853db45c2d1e9f449bd14..00bcc698214a3938b5ae5c3b9b6917114f596c5c 100644 --- a/src/pairs/ir/variables.py +++ b/src/pairs/ir/variables.py @@ -49,6 +49,7 @@ class Var(ASTTerm): self.mutable = True self.var_bonded_arrays = [] self.device_flag = False + self.force_read = False if temp: DeclareVariable(sim, self) @@ -104,6 +105,10 @@ class Deref(ASTTerm): def var(self): return self._var + def copy(self, deep=False): + # Terminal copies are just themselves + return self + def type(self): return self._var.type() diff --git a/src/pairs/sim/comm.py b/src/pairs/sim/comm.py index 988e71d0bcb2f325b5582c819fd010e221ab50ca..119e432bc77f8bf2dfaee3ca80129f95af6ff4b4 100644 --- a/src/pairs/sim/comm.py +++ b/src/pairs/sim/comm.py @@ -23,7 +23,7 @@ class Comm: self.nsend_all = sim.add_var('nsend_all', Types.Int32) self.send_capacity = sim.add_var('send_capacity', Types.Int32, 200000) self.recv_capacity = sim.add_var('recv_capacity', Types.Int32, 200000) - self.elem_capacity = sim.add_var('elem_capacity', Types.Int32, 40) + self.elem_capacity = sim.add_var('elem_capacity', Types.Int32, 100) self.nsend = sim.add_array('nsend', [dom_part.nranks_capacity], Types.Int32) self.send_offsets = sim.add_array('send_offsets', [dom_part.nranks_capacity], Types.Int32) self.send_buffer = sim.add_array('send_buffer', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False) @@ -41,6 +41,13 @@ class Comm: self.contact_soffsets = sim.add_array('contact_soffsets', [dom_part.nranks_capacity], Types.Int32) self.contact_roffsets = sim.add_array('contact_roffsets', [dom_part.nranks_capacity], Types.Int32) + self.nsend_reverse = sim.add_array('nsend_reverse', [dom_part.nranks_capacity], Types.Int32) + self.send_offsets_reverse = sim.add_array('send_offsets_reverse', [dom_part.nranks_capacity], Types.Int32) + self.send_buffer_reverse = sim.add_array('send_buffer_reverse', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False) + self.nrecv_reverse = sim.add_array('nrecv_reverse', [dom_part.nranks_capacity], Types.Int32) + self.recv_offsets_reverse = sim.add_array('recv_offsets_reverse', [dom_part.nranks_capacity], Types.Int32) + self.recv_buffer_reverse = sim.add_array('recv_buffer_reverse', [self.recv_capacity, self.elem_capacity], Types.Real, arr_sync=False) + @pairs_inline def synchronize(self): # Every property that is not constant across timesteps and have neighbor accesses during any @@ -51,6 +58,33 @@ class Comm: PackAllGhostParticles(self, prop_list) CommunicateAllData(self, prop_list) UnpackAllGhostParticles(self, prop_list) + + @pairs_host_block + def reverse_comm(self, reduce=False): + self.sim.module_name(f"reverse_comm") + self.prop_list = self.sim.properties.reduction_props() + + for step in range(self.dom_part.number_of_steps() - 1, -1, -1): + if self.sim._target.is_gpu(): + CopyArray(self.sim, self.nsend, Contexts.Host, Actions.ReadOnly) + CopyArray(self.sim, self.nrecv, Contexts.Host, Actions.ReadOnly) + CopyArray(self.sim, self.send_offsets, Contexts.Host, Actions.ReadOnly) + CopyArray(self.sim, self.recv_offsets, Contexts.Host, Actions.ReadOnly) + + # CopyArray(self.sim, self.nsend_reverse, Contexts.Host, Actions.WriteOnly) + # CopyArray(self.sim, self.nrecv_reverse, Contexts.Host, Actions.WriteOnly) + # CopyArray(self.sim, self.send_offsets_reverse, Contexts.Host, Actions.WriteOnly) + # CopyArray(self.sim, self.recv_offsets_reverse, Contexts.Host, Actions.WriteOnly) + + for j in self.dom_part.step_indexes(step): + Assign(self.sim, self.nsend_reverse[j], self.nrecv[j]) + Assign(self.sim, self.nrecv_reverse[j], self.nsend[j]) + Assign(self.sim, self.send_offsets_reverse[j], self.recv_offsets[j]) + Assign(self.sim, self.recv_offsets_reverse[j], self.send_offsets[j]) + + PackGhostParticlesReverse(self, step, self.prop_list) + CommunicateDataReverse(self, step, self.prop_list) + UnpackGhostParticlesReverse(self, step, self.prop_list, reduce) @pairs_inline def borders(self): @@ -180,7 +214,24 @@ class CommunicateData(Lowerable): self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend, self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv]) +class CommunicateDataReverse(Lowerable): + def __init__(self, comm, step, prop_list): + super().__init__(comm.sim) + self.comm = comm + self.step = step + self.prop_list = prop_list + self.sim.add_statement(self) + @pairs_inline + def lower(self): + elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list]) + + Call_Void(self.sim, + "pairs_runtime->communicateDataReverse", + [self.step, elem_size, + self.comm.send_buffer_reverse, self.comm.send_offsets_reverse, self.comm.nsend_reverse, + self.comm.recv_buffer_reverse, self.comm.recv_offsets_reverse, self.comm.nrecv_reverse]) + class CommunicateContactHistoryData(Lowerable): def __init__(self, comm, step): super().__init__(comm.sim) @@ -329,6 +380,45 @@ class PackGhostParticles(Lowerable): Assign(self.sim, send_buffer[i][p_offset], cast_fn(p[m])) p_offset += 1 +class PackGhostParticlesReverse(Lowerable): + def __init__(self, comm, step, prop_list): + super().__init__(comm.sim) + self.comm = comm + self.step = step + self.prop_list = prop_list + self.sim.add_statement(self) + + def get_elems_per_particle(self): + return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list]) + + @pairs_device_block + def lower(self): + nlocal = self.sim.nlocal + nghost = self.sim.nghost + send_buffer_reverse = self.comm.send_buffer_reverse + send_buffer_reverse.set_stride(1, self.get_elems_per_particle()) + + self.sim.module_name(f"pack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list])) + + start = self.comm.send_offsets_reverse[self.comm.dom_part.first_step_index(self.step)] + end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nsend_reverse)) + for i in For(self.sim, start, end): + p_offset = 0 + m = nlocal + i + for p in self.prop_list: + if not Types.is_scalar(p.type()): + nelems = Types.number_of_elements(self.sim, p.type()) + for e in range(nelems): + src = p[m][e] + Assign(self.sim, send_buffer_reverse[i][p_offset + e], src) + + p_offset += nelems + + else: + cast_fn = lambda x: Cast(self.sim, x, Types.Real) if p.type() != Types.Real else x + Assign(self.sim, send_buffer_reverse[i][p_offset], cast_fn(p[m])) + p_offset += 1 + class UnpackGhostParticles(Lowerable): def __init__(self, comm, step, prop_list): @@ -365,6 +455,49 @@ class UnpackGhostParticles(Lowerable): Assign(self.sim, p[nlocal + i], cast_fn(recv_buffer[i][p_offset])) p_offset += 1 +class UnpackGhostParticlesReverse(Lowerable): + def __init__(self, comm, step, prop_list, reduce=False): + super().__init__(comm.sim) + self.comm = comm + self.step = step + self.prop_list = prop_list + self.reduce = reduce + self.sim.add_statement(self) + + + def get_elems_per_particle(self): + return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list]) + + @pairs_device_block + def lower(self): + send_map = self.comm.send_map + recv_buffer_reverse = self.comm.recv_buffer_reverse + recv_buffer_reverse.set_stride(1, self.get_elems_per_particle()) + self.sim.module_name(f"unpack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list])) + + start = self.comm.recv_offsets_reverse[self.comm.dom_part.first_step_index(self.step)] + end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nrecv_reverse)) + for i in For(self.sim, start, end): + p_offset = 0 + m = send_map[i] + for p in self.prop_list: + if not Types.is_scalar(p.type()): + nelems = Types.number_of_elements(self.sim, p.type()) + for e in range(nelems): + if self.reduce: + Assign(self.sim, p[m][e], p[m][e] + recv_buffer_reverse[i][p_offset + e]) + else: + Assign(self.sim, p[m][e], recv_buffer_reverse[i][p_offset + e]) + + p_offset += nelems + + else: + cast_fn = lambda x: Cast(self.sim, x, p.type()) if p.type() != Types.Real else x + if self.reduce: + Assign(self.sim, p[m], p[m] + cast_fn(recv_buffer_reverse[i][p_offset])) + else: + Assign(self.sim, p[m], cast_fn(recv_buffer_reverse[i][p_offset])) + p_offset += 1 class PackAllGhostParticles(Lowerable): def __init__(self, comm, prop_list): diff --git a/src/pairs/sim/domain_partitioning.py b/src/pairs/sim/domain_partitioning.py index e781d235fe82a5dea66e1854ed3cbe33294f801f..cf30351a6ca22b621b4f91cbc745d9e7a003cd13 100644 --- a/src/pairs/sim/domain_partitioning.py +++ b/src/pairs/sim/domain_partitioning.py @@ -87,6 +87,8 @@ class DimensionRanges: class BlockForest: def __init__(self, sim): self.sim = sim + self.reduce_step = sim.add_var('reduce_step', Types.Int32) # this var is treated as a tmp (workaround for gpu) + self.reduce_step.force_read = True self.rank = sim.add_var('rank', Types.Int32) self.nranks = sim.add_var('nranks', Types.Int32) self.nranks_capacity = sim.add_var('nranks_capacity', Types.Int32, init_value=27) @@ -108,7 +110,7 @@ class BlockForest: return 1 def step_indexes(self, step): - yield from For(self.sim, 0, self.nranks) + yield from For(self.sim, 0, self.nranks, not_kernel=True) def first_step_index(self, step): return 0 @@ -117,11 +119,11 @@ class BlockForest: return self.reduce_sum_step_indexes(0, array) def reduce_sum_step_indexes(self, step, array): - nsend_sum = self.sim.add_temp_var(0) - for i in For(self.sim, 0, self.nranks): - Assign(self.sim, nsend_sum, nsend_sum + array[i]) - - return nsend_sum + Assign(self.sim, self.reduce_step, 0) + for i in For(self.sim, 0, self.nranks, not_kernel=True): + Assign(self.sim, self.reduce_step, ScalarOp.inline( self.reduce_step + array[i])) + + return self.reduce_step def initialize(self): grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())] @@ -131,23 +133,25 @@ class BlockForest: Call_Void(self.sim, "pairs_runtime->updateDomain", []) Assign(self.sim, self.rank, Call_Int(self.sim, "pairs_runtime->getDomainPartitioner()->getRank", [])) Assign(self.sim, self.nranks, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborRanks", [])) - Assign(self.sim, self.ntotal_aabbs, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborAABBs", [])) - - for _ in Filter(self.sim, self.nranks_capacity < self.nranks): - Assign(self.sim, self.nranks_capacity, self.nranks + 10) - self.ranks.realloc() - self.naabbs.realloc() - self.aabb_offsets.realloc() - - for _ in Filter(self.sim, self.aabb_capacity < self.ntotal_aabbs): - Assign(self.sim, self.aabb_capacity, self.ntotal_aabbs + 20) - self.aabbs.realloc() - - Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['ranks', self.ranks, self.nranks]) - Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['naabbs', self.naabbs, self.nranks]) - Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabb_offsets', self.aabb_offsets, self.nranks]) - Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabbs', self.aabbs, self.ntotal_aabbs * 6]) - Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2]) + + for _ in Filter(self.sim, ScalarOp.neq(self.nranks, 0)): + Assign(self.sim, self.ntotal_aabbs, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborAABBs", [])) + + for _ in Filter(self.sim, self.nranks_capacity < self.nranks): + Assign(self.sim, self.nranks_capacity, self.nranks + 10) + self.ranks.realloc() + self.naabbs.realloc() + self.aabb_offsets.realloc() + + for _ in Filter(self.sim, self.aabb_capacity < self.ntotal_aabbs): + Assign(self.sim, self.aabb_capacity, self.ntotal_aabbs + 20) + self.aabbs.realloc() + + Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['ranks', self.ranks, self.nranks]) + Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['naabbs', self.naabbs, self.nranks]) + Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabb_offsets', self.aabb_offsets, self.nranks]) + Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabbs', self.aabbs, self.ntotal_aabbs * 6]) + Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2]) def ghost_particles(self, step, position, offset=0.0): ''' TODO : If we have pbc, a sinlge particle can be a ghost particle multiple times (at different locations) for the same neighbor block, diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py index 15332c4fab3fa8ffd53826c8f08cde38a491805f..c23c377c3b62e4bb83494037df47e9b39411ba5d 100644 --- a/src/pairs/sim/simulation.py +++ b/src/pairs/sim/simulation.py @@ -200,9 +200,9 @@ class Simulation: assert len(pbc_config) == self.dims, "PBC must be specified for each dimension." self._pbc = pbc_config - def add_property(self, prop_name, prop_type, value=0.0, volatile=False): + def add_property(self, prop_name, prop_type, value=0.0, volatile=False, reduce=False): assert self.property(prop_name) is None, f"Property already defined: {prop_name}" - return self.properties.add(prop_name, prop_type, value, volatile) + return self.properties.add(prop_name, prop_type, value, volatile, p_reduce=reduce) def add_position(self, prop_name, value=[0.0, 0.0, 0.0], volatile=False, layout=Layouts.AoS): assert self.property(prop_name) is None, f"Property already defined: {prop_name}" @@ -442,6 +442,8 @@ class Simulation: # Initialize communication instance with specified domain-partitioner comm = Comm(self, self._dom_part) + reverse_comm_module = comm.reverse_comm(reduce=True) + # Params that determine when a method must be called only when reneighboring every_reneighbor_params = {'every': self.reneighbor_frequency} @@ -473,7 +475,14 @@ class Simulation: timestep_procedures.append(ResetContactHistoryUsageStatus(self, self._contact_history)) # Reset volatile properties and add computational kernels - timestep_procedures += [ResetVolatileProperties(self)] + self.functions + timestep_procedures += [ResetVolatileProperties(self)] + + # add computational kernels + timestep_procedures += self.functions + + # For whole-program-generation, add reverse_comm wherever needed in the timestep loop (eg: after computational kernels) like this: + if self._generate_whole_program: + timestep_procedures += [reverse_comm_module] # Clear unused contact history if self._use_contact_history: @@ -543,12 +552,12 @@ class Simulation: setup_sim_module = Module(self, name='setup_sim', block=setup_sim) do_timestep_module = Module(self, name='do_timestep', block=timestep.as_block()) - modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module] + modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module] transformations = Transformations(modules_list, self._target) transformations.apply_all() # Generate library - self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module) + self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module) self.code_gen.generate_interfaces()