diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index 1768b512ac2a588c40a1433377ce86dd7e289697..0b02c55b5b403b54ef4380aaf6ae50807fb27664 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -175,7 +175,7 @@ int main(int argc, char** argv)
       auto communication = std::make_shared< NonUniformGPUScheme< CommunicationStencil_T > >(blocks, gpuEnabledMPI);
       auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, pdfFieldGpuID);
       communication->addPackInfo(packInfo);
-      communication->useStreams(asyncCommunication);
+      if(asyncCommunication) { communication->activateGPUStreams(); }
       WALBERLA_MPI_BARRIER()

       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -191,12 +191,7 @@ int main(int argc, char** argv)
          LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo);

       SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
-      if(asyncCommunication){
-         LBMMeshRefinement.addRefinementWithStreamsToTimeLoop(timeLoop);
-      }
-      else{
-         LBMMeshRefinement.addRefinementToTimeLoop(timeLoop);
-      }
+      LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0), asyncCommunication);

       // VTK
       const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index 71d5dcbf5a155b1e6003db507457fb3763ae4182..0f122fbe3055b00eb47e0b702862ec49e4f024a7 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -162,7 +162,7 @@ def scaling(num_proc, gpu_enabled_mpi=False, uniform=True):
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])

     scenarios = wlb.ScenarioManager()
-    for async_communication in [True, False]:
+    for async_communication in [True, ]:
         scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
                             domain_size=domain_size,
                             root_blocks=root_blocks,
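Note on the two benchmark changes above: the boolean setter useStreams(asyncCommunication) becomes an explicit opt-in via activateGPUStreams(), and the if/else between addRefinementWithStreamsToTimeLoop and addRefinementToTimeLoop collapses into a single registration call taking the start level and the stream flag. A minimal call-site sketch, assuming the benchmark's existing setup (communication, LBMMeshRefinement, timeLoop, and asyncCommunication as in the hunks above):

   // Sketch only: wiring the reworked API at a call site.
   if (asyncCommunication)
      communication->activateGPUStreams();   // creates the fixed stream pool once

   // One entry point now serves both the synchronous and the stream-based cycle:
   LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0), asyncCommunication);
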
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index fd5aa83f32c0357d67d5d9540f5761cec6f83067..46006910ea0d067172876e149341470e90c231ab 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -65,18 +65,16 @@ class NonUniformGPUScheme
    /*! \name Pack Info Registration */
    //@{
    void addPackInfo(const shared_ptr< GeneratedNonUniformGPUPackInfo >& pi);
-   void useStreams(const bool useStreams)
+   void activateGPUStreams()
    {
-      useStreams_ = useStreams;
-      if (useStreams_)
+      useStreams_ = true;
+      WALBERLA_LOG_INFO_ON_ROOT("Using " << nStreams_ << " GPU streams per level for communication kernels")
+      for (uint_t i = 0; i < 3; ++i)
       {
-         for (uint_t i = 0; i < 3; ++i)
+         for (uint_t j = 0; j < nStreams_; ++j)
          {
-            for (uint_t j = 0; j < Stencil::Q; ++j)
-            {
-               WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i][j]))
-               WALBERLA_GPU_CHECK(gpuStreamCreate(&localStreams_[i][j]))
-            }
+            WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i][j]))
+            WALBERLA_GPU_CHECK(gpuStreamCreate(&localStreams_[i][j]))
          }
       }
    }
@@ -167,6 +165,7 @@ class NonUniformGPUScheme

    std::vector<std::vector< gpuStream_t >> streams_;
    std::vector<std::vector< gpuStream_t >> localStreams_;
+   uint_t nStreams_{uint_c(6)};
 };

 template< typename Stencil >
@@ -224,9 +223,9 @@ void NonUniformGPUScheme< Stencil >::init()

    for (uint_t i = 0; i < 3; ++i)
    {
-      streams_[i].resize(Stencil::Q);
-      localStreams_[i].resize(Stencil::Q);
-      for (uint_t j = 0; j < Stencil::Q; ++j)
+      streams_[i].resize(nStreams_);
+      localStreams_[i].resize(nStreams_);
+      for (uint_t j = 0; j < nStreams_; ++j)
       {
          streams_[i][j] = nullptr;
          localStreams_[i][j] = nullptr;
@@ -384,7 +383,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
             auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
             for (auto& pi : packInfos_)
             {
-               pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, localStreams_[EQUAL_LEVEL][*dir]);
+               pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, localStreams_[EQUAL_LEVEL][*dir % nStreams_]);
             }
          }
          else
@@ -398,17 +397,17 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
               WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeEqualLevelSend(senderBlock, *dir))
               if(sendFromGPU_)
               {
-                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[EQUAL_LEVEL][*dir]);
+                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[EQUAL_LEVEL][*dir % nStreams_]);
               }
               else
               {
                  auto gpuDataPtr = gpuDataBuffer.cur();
                  // packDataEqualLevel moves the pointer with advanceNoResize
-                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[EQUAL_LEVEL][*dir]);
+                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[EQUAL_LEVEL][*dir % nStreams_]);
                  auto size = pi->sizeEqualLevelSend(senderBlock, *dir);
                  auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size);
                  WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
-                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[EQUAL_LEVEL][*dir]))
+                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[EQUAL_LEVEL][*dir % nStreams_]))
               }
            }
         }
@@ -417,7 +416,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
      // wait for packing to finish
      if(useStreams_)
      {
-         for (uint_t i = 0; i < Stencil::Q; ++i)
+         for (uint_t i = 0; i < nStreams_; ++i)
         {
            WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[EQUAL_LEVEL][i]))
         }
@@ -480,7 +479,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
            auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) );
            for (auto& pi : packInfos_)
            {
-               pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, localStreams_[COARSE_TO_FINE][*dir]);
+               pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, localStreams_[COARSE_TO_FINE][*dir % nStreams_]);
            }
         }
         else
@@ -493,7 +492,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
              WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
              if (sendFromGPU_)
              {
-                 pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[COARSE_TO_FINE][*dir]);
+                 pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[COARSE_TO_FINE][*dir % nStreams_]);
              }
              else
              {
@@ -514,7 +513,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
      // wait for packing to finish
      if(useStreams_)
      {
-         for (uint_t i = 0; i < Stencil::Q; ++i)
+         for (uint_t i = 0; i < nStreams_; ++i)
         {
            WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[COARSE_TO_FINE][i]))
         }
@@ -576,7 +575,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
            auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) );
            for (auto& pi : packInfos_)
            {
-               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, localStreams_[FINE_TO_COARSE][*dir]);
+               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, localStreams_[FINE_TO_COARSE][*dir % nStreams_]);
            }
         }
         else
@@ -589,7 +588,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
              WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeFineToCoarseSend(fineBlock, *dir))
              if (sendFromGPU_)
              {
-                 pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[FINE_TO_COARSE][*dir]);
+                 pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[FINE_TO_COARSE][*dir % nStreams_]);
              }
              else
              {
@@ -609,7 +608,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
      // wait for packing to finish
      if(useStreams_)
      {
-         for (uint_t i = 0; i < Stencil::Q; ++i)
+         for (uint_t i = 0; i < nStreams_; ++i)
        {
           WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[FINE_TO_COARSE][i]))
        }
@@ -647,7 +646,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
                 {
                    GpuBuffer_T& gpuDataBuffer = recvInfo.buffer();
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                    pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[EQUAL_LEVEL][header.dir]);
+                    pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[EQUAL_LEVEL][header.dir % nStreams_]);
                 }
              }
           }
@@ -673,15 +672,15 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
                    WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)

-                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[EQUAL_LEVEL][header.dir]))
-                    pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer, streams_[EQUAL_LEVEL][header.dir]);
+                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[EQUAL_LEVEL][header.dir % nStreams_]))
+                    pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer, streams_[EQUAL_LEVEL][header.dir % nStreams_]);
                 }
              }
           }
        }
     if(useStreams_)
     {
-        for (uint_t i = 0; i < Stencil::Q; ++i)
+        for (uint_t i = 0; i < nStreams_; ++i)
        {
           WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[EQUAL_LEVEL][i]))
           WALBERLA_GPU_CHECK(gpuStreamSynchronize(localStreams_[EQUAL_LEVEL][i]))
@@ -712,7 +711,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
                    GpuBuffer_T &gpuDataBuffer = recvInfo.buffer();
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                    pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir],
-                                              gpuDataBuffer, streams_[COARSE_TO_FINE][header.dir]);
+                                              gpuDataBuffer, streams_[COARSE_TO_FINE][header.dir % nStreams_]);
                 }
              }
           }
@@ -736,15 +735,15 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
                    WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)

-                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[COARSE_TO_FINE][header.dir]))
-                    pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[COARSE_TO_FINE][header.dir]);
+                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[COARSE_TO_FINE][header.dir % nStreams_]))
+                    pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[COARSE_TO_FINE][header.dir % nStreams_]);
                 }
              }
           }
        }
     if(useStreams_)
     {
-        for (uint_t i = 0; i < Stencil::Q; ++i)
+        for (uint_t i = 0; i < nStreams_; ++i)
        {
           WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[COARSE_TO_FINE][i]))
           WALBERLA_GPU_CHECK(gpuStreamSynchronize(localStreams_[COARSE_TO_FINE][i]))
@@ -778,7 +777,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
                 {
                    GpuBuffer_T& gpuDataBuffer = recvInfo.buffer();
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                    pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[FINE_TO_COARSE][header.dir]);
+                    pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[FINE_TO_COARSE][header.dir % nStreams_]);
                 }
              }
           }
@@ -802,8 +801,8 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
                    WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                    WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)

-                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[FINE_TO_COARSE][header.dir]))
-                    pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[FINE_TO_COARSE][header.dir]);
+                    WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[FINE_TO_COARSE][header.dir % nStreams_]))
+                    pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[FINE_TO_COARSE][header.dir % nStreams_]);
                 }
              }
           }
@@ -811,7 +810,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi

    if(useStreams_)
    {
-      for (uint_t i = 0; i < Stencil::Q; ++i)
+      for (uint_t i = 0; i < nStreams_; ++i)
      {
         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[FINE_TO_COARSE][i]))
         WALBERLA_GPU_CHECK(gpuStreamSynchronize(localStreams_[FINE_TO_COARSE][i]))
@@ -1017,7 +1016,7 @@ NonUniformGPUScheme< Stencil >::~NonUniformGPUScheme()
   {
      for (uint_t i = 0; i < 3; ++i)
      {
-         for (uint_t j = 0; j < Stencil::Q; ++j)
+         for (uint_t j = 0; j < nStreams_; ++j)
        {
           WALBERLA_ASSERT_NOT_NULLPTR(streams_[i][j])
           WALBERLA_ASSERT_NOT_NULLPTR(localStreams_[i][j])
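Note on the NonUniformGPUScheme changes above: the per-direction stream arrays (Stencil::Q entries per communication type, e.g. 19 or 27 for D3Q19/D3Q27) shrink to a fixed pool of nStreams_ = 6 streams, and every per-direction access maps onto the pool with *dir % nStreams_ (or header.dir % nStreams_ on the receiving side). Because the mapping is a pure function of the direction, pack, memcpy, and unpack work for one direction still serialize on one stream. A standalone sketch of the indexing idea (illustrative only; StreamPool is not waLBerla code):

   #include <cstddef>
   #include <vector>

   struct StreamPool
   {
      explicit StreamPool(std::size_t n) : handles_(n) {}
      // Directions 0..Q-1 share n streams; dir % n is stable per direction,
      // so all work for one direction stays ordered on the same stream.
      std::size_t indexFor(std::size_t dir) const { return dir % handles_.size(); }
    private:
      std::vector<void*> handles_; // stand-in for gpuStream_t handles
   };
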
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
index f4a49e4b52733a7a2f96834df1f4df9b15a4d48c..9976afb0e056e894bb9c96a471e094fcbad5759c 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
@@ -77,11 +77,11 @@ class BasicRecursiveTimeStepGPU
         std::vector< Block* > blocks;
         sbfs->getBlocks(blocks, level);
         blocks_.push_back(blocks);
-         streams_[level].resize(blocks.size());
+         streams_[level].resize(nStreams_);
      }
      for (uint_t level = 0; level <= maxLevel_; level++)
      {
-         for (uint_t i = 0; i < blocks_[level].size(); i++)
+         for (uint_t i = 0; i < nStreams_; i++)
         {
            streams_[level][i] = nullptr;
         }
@@ -93,7 +93,7 @@ class BasicRecursiveTimeStepGPU
      if(useStreams_){
         for (uint_t level = 0; level <= maxLevel_; level++)
         {
-            for (uint_t i = 0; i < blocks_[level].size(); i++)
+            for (uint_t i = 0; i < nStreams_; i++)
            {
               WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[level][i]))
            }
@@ -102,10 +102,10 @@ class BasicRecursiveTimeStepGPU
    void activateStreams()
    {
-      WALBERLA_LOG_INFO_ON_ROOT("Using asynchronous communication with GPU streams")
+      WALBERLA_LOG_INFO_ON_ROOT("Updating blocks using " << nStreams_ << " GPU Streams")
       for (uint_t level = 0; level <= maxLevel_; level++)
       {
-         for (uint_t i = 0; i < blocks_[level].size(); i++)
+         for (uint_t i = 0; i < nStreams_; i++)
          {
             WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[level][i]))
          }
       }
@@ -114,8 +114,8 @@ class BasicRecursiveTimeStepGPU
    }

    void operator()() { timestep(0); };
-   void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level = 0);
-   void addRefinementWithStreamsToTimeLoop(SweepTimeloop& timeloop, uint_t level = 0);
+   void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level);
+   void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level, bool utiliseGPUStreams);
    inline void addPostBoundaryHandlingBlockFunction( const BlockFunction & function );

    void waitAllCommunications(){commScheme_->waitAllCommunications();};
@@ -126,12 +126,7 @@ class BasicRecursiveTimeStepGPU

    std::function< void() > executeBoundaryHandlingOnLevel(uint_t level);
    std::function< void() > executePostBoundaryBlockFunctions(uint_t level);
-   std::function< void() > executeStreamCollideInnerOnLevel(uint_t level);
-   std::function< void() > executeStreamCollideOuterOnLevel(uint_t level);
-   std::function< void() > executeGhostLayerPropagationOnLevel(uint_t level);
-
-   void synchronousStep(SweepTimeloop & timeloop, uint_t level);
-   void asynchronousStep(SweepTimeloop & timeloop, uint_t level);
+   void streamCollideWithCommunication(SweepTimeloop & timeloop, uint_t level, bool ghostLayerPropagation);
    std::function<void()> syncLevel(uint_t level);

    std::shared_ptr< StructuredBlockForest > sbfs_;
@@ -147,6 +142,7 @@ class BasicRecursiveTimeStepGPU
    std::vector< BlockFunction > globalPostBoundaryHandlingBlockFunctions_;

    std::vector< std::vector< gpuStream_t >> streams_;
+   uint_t nStreams_{uint_c(6)};
    bool useStreams_;
 };
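Note on the header above: the default argument uint_t level = 0 is dropped, so the old one-argument call addRefinementToTimeLoop(timeLoop) no longer compiles; callers must pass the start level explicitly, optionally followed by the stream flag (the benchmark above was updated accordingly). Sketch of the two remaining forms, assuming an instance named LBMMeshRefinement as in the benchmark:

   LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0));        // two-argument overload
   LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0), true);  // opt in to GPU streams
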
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
index 576a385bc0196b40b138b9a9e289372596f81032..910811e277b3ed422095c3c871e5a28e6ca20c09 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
@@ -145,28 +145,30 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio
 }

 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementWithStreamsToTimeLoop(SweepTimeloop & timeloop, uint_t level)
+void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level, bool utiliseGPUStreams)
 {
-   if(!useStreams_) { activateStreams(); }
+   if(utiliseGPUStreams && !useStreams_) { activateStreams(); }

    // 1.1 Collision
-   synchronousStep(timeloop, level);
+   streamCollideWithCommunication(timeloop, level, false);

    // 1.2 Recursive Descent
    if(level < maxLevel_){
-      addRefinementWithStreamsToTimeLoop(timeloop, level + 1);
+      addRefinementToTimeLoop(timeloop, level + 1, utiliseGPUStreams);
    }

    // 1.3 Coarse to Fine Communication, receiving end
    if(level != 0){
-      timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateCoarseToFineFunctor(level), "Refinement Cycle: start CF on level " + std::to_string(level));
+      timeloop.addFuncBeforeTimeStep(commScheme_->communicateCoarseToFineFunctor(level), "Refinement Cycle: CF on level " + std::to_string(level));
    }

+   // 1.4 Equal-Level Communication
+   timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateEqualLevelFunctor(level), "Refinement Cycle: start EC on level " + std::to_string(level));
+
    // 1.5 Boundary Handling and Coalescence Preparation
    timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: BC on level " + std::to_string(level));
    if(level == 0)
       timeloop.addFuncBeforeTimeStep(executePostBoundaryBlockFunctions(level), "Refinement Cycle: post BC block functions on level " + std::to_string(level));
-
    // 1.6 Fine to Coarse Communication, receiving end
    if(level < maxLevel_){
       timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateFineToCoarseFunctor(level + 1), "Refinement Cycle: start FC on level " + std::to_string(level + 1));
@@ -177,14 +179,13 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio
    if(level == 0) return;

    // 2.1 Collision and Ghost-Layer Propagation
-   asynchronousStep(timeloop, level);
+   streamCollideWithCommunication(timeloop, level, true);

    // 2.2 Recursive Descent
    if(level < maxLevel_)
-      addRefinementWithStreamsToTimeLoop(timeloop, level + 1);
-
+      addRefinementToTimeLoop(timeloop, level + 1, utiliseGPUStreams);

    // 2.4 Equal-Level Communication
-
+   timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateEqualLevelFunctor(level), "Refinement Cycle: start EC on level " + std::to_string(level));

    // 2.5 Boundary Handling and Coalescence Preparation
    timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: BC on level " + std::to_string(level));
@@ -206,64 +207,28 @@ std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T,
      {
         for (uint_t i = 0; i < blocks_[level].size(); i++)
         {
-            ghostLayerPropagation(blocks_[level][i], streams_[level][i]);
-            sweepCollection_.streamCollide(blocks_[level][i], 0, streams_[level][i]);
+            ghostLayerPropagation(blocks_[level][i], streams_[level][i % nStreams_]);
+            sweepCollection_.streamCollide(blocks_[level][i], 0, streams_[level][i % nStreams_]);
         }
      }
      else
      {
         for (uint_t i = 0; i < blocks_[level].size(); i++)
         {
-            sweepCollection_.streamCollide(blocks_[level][i], 0, streams_[level][i]);
+            sweepCollection_.streamCollide(blocks_[level][i], 0, streams_[level][i % nStreams_]);
         }
      }
   };
 }

-template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideInnerOnLevel(uint_t level)
-{
-   return [level, this]()
-   {
-      for (uint_t i = 0; i < blocks_[level].size(); i++)
-      {
-         sweepCollection_.streamCollideInner(blocks_[level][i], streams_[level][i]);
-      }
-   };
-}
-
-template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideOuterOnLevel(uint_t level)
-{
-   return [level, this]()
-   {
-      for (uint_t i = 0; i < blocks_[level].size(); i++)
-      {
-         sweepCollection_.streamCollideOuter(blocks_[level][i], streams_[level][i]);
-      }
-   };
-}
-
-template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeGhostLayerPropagationOnLevel(uint_t level)
-{
-   return [level, this]()
-   {
-      for (uint_t i = 0; i < blocks_[level].size(); i++)
-      {
-         ghostLayerPropagation(blocks_[level][i], streams_[level][i]);
-      }
-   };
-}
-
 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
 std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeBoundaryHandlingOnLevel(uint_t level)
 {
    return [this, level]()
    {
       for (uint_t i = 0; i < blocks_[level].size(); i++)
       {
-         boundaryCollection_(blocks_[level][i], streams_[level][i]);
-         if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(blocks_[level][i], streams_[level][i]);
+         boundaryCollection_(blocks_[level][i], streams_[level][i % nStreams_]);
+         if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(blocks_[level][i], streams_[level][i % nStreams_]);
       }
    };
 }
@@ -290,39 +255,24 @@ std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T,
      {
         for (uint_t i = 0; i < blocks_[level].size(); i++)
         {
-            WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[level][i]))
+            WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[level][i % nStreams_]))
         }
      }
   };
 }

 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::synchronousStep(SweepTimeloop & timeloop, uint_t level)
+void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::streamCollideWithCommunication(SweepTimeloop & timeloop, uint_t level, bool ghostLayerPropagation)
 {
    if(level < maxLevel_)
    {
       timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateFineToCoarseFunctor(level + 1), "Refinement Cycle: wait FC on level " + std::to_string(level + 1));
    }
-   timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateCoarseToFineFunctor(level), "Refinement Cycle: wait CF on level " + std::to_string(level));
+   // timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateCoarseToFineFunctor(level), "Refinement Cycle: wait CF on level " + std::to_string(level));
    timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateEqualLevelFunctor(level), "Refinement Cycle: wait EC on level " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, false), "Refinement Cycle: stream Collide " + std::to_string(level));
+   timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, ghostLayerPropagation), "Refinement Cycle: stream Collide " + std::to_string(level));
    timeloop.addFuncBeforeTimeStep(syncLevel(level), "Refinement Cycle: Sync " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateEqualLevelFunctor(level), "Refinement Cycle: start EC on level " + std::to_string(level));
-}
-
-template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::asynchronousStep(SweepTimeloop & timeloop, uint_t level)
-{
-   if(level < maxLevel_)
-   {
-      timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateFineToCoarseFunctor(level + 1), "Refinement Cycle: wait FC on level " + std::to_string(level + 1));
-   }
-   timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateCoarseToFineFunctor(level), "Refinement Cycle: wait CF on level " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(commScheme_->waitCommunicateEqualLevelFunctor(level), "Refinement Cycle: wait EC on level " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, true), "Refinement Cycle: stream Collide with ghostLayerPropagation " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(syncLevel(level), "Refinement Cycle: Sync " + std::to_string(level));
-   timeloop.addFuncBeforeTimeStep(commScheme_->startCommunicateEqualLevelFunctor(level), "Refinement Cycle: start EC on level " + std::to_string(level));
 }
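Note on the .impl.h changes above: synchronousStep and asynchronousStep merge into streamCollideWithCommunication(timeloop, level, ghostLayerPropagation); the "start EC" registration moves up into addRefinementToTimeLoop, and the wait-CF registration is commented out because coarse-to-fine communication now completes eagerly via communicateCoarseToFineFunctor. On the kernel side, blocks on a level no longer get one stream each but are distributed round-robin over the fixed pool, as in this condensed sketch of the loops above (names from the diff; setup elided):

   // Condensed from executeStreamCollideOnLevel: all blocks of one level
   // share nStreams_ (default 6) streams instead of one stream per block.
   for (uint_t i = 0; i < blocks_[level].size(); i++)
   {
      sweepCollection_.streamCollide(blocks_[level][i], 0, streams_[level][i % nStreams_]);
   }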