diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index e6799acb7b4247371d9979f175b3064b873e1f43..c5b09184bacb697491ffea3b1782ad84cbe3efd9 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -29,9 +29,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
 endif()
 
 if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
-   add_subdirectory(CommunicationGPU)
-
    if ( WALBERLA_BUILD_WITH_CODEGEN )
+      add_subdirectory( CommunicationGPU )
       add_subdirectory( UniformGridGPU )
      add_subdirectory( NonUniformGridGPU )
    endif()
diff --git a/apps/benchmarks/CommunicationGPU/CMakeLists.txt b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
index 229e0a4a4164f5c5480c161dfb10fa134493c00a..d45a2b7967cd83d740eadea77faa18fdd83fbd3e 100644
--- a/apps/benchmarks/CommunicationGPU/CMakeLists.txt
+++ b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
@@ -3,5 +3,18 @@
 waLBerla_link_files_to_builddir( "*.dat" )
 waLBerla_link_files_to_builddir( "*.py" )
 
-waLBerla_add_executable ( NAME CommunicationGPU
-                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling )
+waLBerla_generate_target_from_python(NAME CommunicationGPUGenerated
+                                     FILE CommunicationGPUCodeGen.py
+                                     OUT_FILES StorageSpec.h StorageSpec.${CODEGEN_FILE_SUFFIX}
+                                               CommunicationGPUInfoHeader.h
+                                     )
+
+waLBerla_add_executable (
+      NAME CommunicationGPU
+      FILES CommunicationGPU.cpp
+      DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
+
+waLBerla_add_executable (
+      NAME GPUPackPerformance
+      FILES GPUPackPerformance.cpp
+      DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
index 32a504072d5e776ac9e0a89bd1669a636cb6416d..1cee36f1e76fa563b333b332f5b93497ede14b86 100644
--- a/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
@@ -28,33 +28,34 @@
 #include "core/mpi/all.h"
 #include "core/waLBerlaBuildInfo.h"
 
-#include "field/AddToStorage.h"
-
-#include "gpu/AddGPUFieldToStorage.h"
 #include "gpu/DeviceSelectMPI.h"
 #include "gpu/FieldCopy.h"
-#include "gpu/GPUField.h"
 #include "gpu/GPUWrapper.h"
 #include "gpu/HostFieldAllocator.h"
 #include "gpu/ParallelStreams.h"
 #include "gpu/communication/UniformGPUScheme.h"
 #include "gpu/communication/MemcpyPackInfo.h"
 
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 
-#include "stencil/D3Q27.h"
-
 #include "sqlite/SQLite.h"
 
-#include <cmath>
+#include "CommunicationGPUInfoHeader.h"
 
 using namespace walberla;
 using gpu::communication::UniformGPUScheme;
 
-using Field_T = field::GhostLayerField<real_t, 1>;
-using GPUField_T = gpu::GPUField<real_t>;
+using StorageSpecification_T = lbm::StorageSpec;
+using Stencil_T = StorageSpecification_T::Stencil;
 
-using Stencil_T = stencil::D3Q27;
+using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
 
 std::string fromEnv(const char *envVar) {
    auto env = std::getenv(envVar);
@@ -95,6 +96,7 @@ int main(int argc, char **argv) {
      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+     const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");
 
      field::Layout layout;
      if (layoutStr == "fzyx")
@@ -127,12 +129,10 @@ int main(int argc, char **argv) {
                                                        false // keepGlobalBlockInformation
      );
 
-     auto rank = mpiManager->rank();
-
+     const StorageSpecification_T StorageSpec = StorageSpecification_T();
      auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
-     const BlockDataID fieldCPU = field::addToStorage<Field_T>(blocks, "field", real_c(rank), layout, uint_c(1),
-                                                               allocator);
-     const BlockDataID fieldGPU = gpu::addGPUFieldToStorage<Field_T>(blocks, fieldCPU, "field GPU", true);
+     const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
+     const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);
 
      gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
 
@@ -141,14 +141,40 @@ int main(int argc, char **argv) {
      /// COMMUNICATION SCHEME                                                                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+     std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
+     if(packInfoName == "MemcpyPackInfo")
+     {
+        packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+     }
+     else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
+     {
+        packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
+     }
+     else
+     {
+        WALBERLA_ABORT("Unknown packInfoName: " << packInfoName)
+     }
+
      UniformGPUScheme<Stencil_T> communication(blocks, gpuDirectComm);
-     auto packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+
      communication.addPackInfo(packInfo);
 
-     auto communicate = communication.getCommunicateFunctor();
      auto commStart = communication.getStartCommunicateFunctor();
      auto commWait = communication.getWaitFunctor();
 
+     WALBERLA_ROOT_SECTION()
+     {
+        for (auto &block : *blocks)
+        {
+           for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir )
+           {
+              WALBERLA_LOG_INFO("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block) << " bytes will be communicated")
+           }
+        }
+     }
+
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      /// TIME STEP DEFINITIONS                                                                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -178,30 +204,30 @@ int main(int argc, char **argv) {
 
      mpi::broadcastObject(iterations);
 
-     WcTimingPool timingPool;
+     auto timingPool = std::make_shared<WcTimingPool>();
+     communication.enableTiming(timingPool);
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-     WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations);
+     WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)
+
      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
-        timingPool["totalTime"].start();
+        timingPool->operator[]("totalTime").start();
         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
-           timingPool["commStart"].start();
+           timingPool->operator[]("commStart").start();
            commStart();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-           timingPool["commStart"].end();
+           timingPool->operator[]("commStart").end();
 
-           timingPool["commWait"].start();
+           timingPool->operator[]("commWait").start();
            commWait();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-           timingPool["commWait"].end();
+           timingPool->operator[]("commWait").end();
         }
-        timingPool["totalTime"].end();
+        timingPool->operator[]("totalTime").end();
      }
 
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
-     auto reducedTimingPool = timingPool.getReduced(timing::REDUCE_TOTAL, 0);
+     auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
      WALBERLA_ROOT_SECTION() {
-        WALBERLA_LOG_RESULT(*reducedTimingPool);
+        WALBERLA_LOG_RESULT(*reducedTimingPool)
 
         std::map<std::string, walberla::int64_t> integerProperties;
         std::map<std::string, double> realProperties;
@@ -213,9 +239,8 @@ int main(int argc, char **argv) {
              stringProperties[it->first] = it->second;
        }
 
-        realProperties["total_min"] = real_c(timingPool["totalTime"].min()) / real_c(iterations);
-        realProperties["total_avg"] = real_c(timingPool["totalTime"].average() / real_c(iterations));
-        realProperties["total_max"] = real_c(timingPool["totalTime"].max() / real_c(iterations));
+        realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
+        realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max()) / real_c(iterations);
 
        integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
        integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
@@ -237,6 +262,7 @@ int main(int argc, char **argv) {
        integerProperties["outerIterations"] = int64_c(outerIterations);
 
        stringProperties["layout"] = layoutStr;
+       stringProperties["packInfoName"] = packInfoName;
 
        stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
        stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
@@ -258,7 +284,7 @@ int main(int argc, char **argv) {
        stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
 
        auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
-       sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, timingPool, "TimingRoot");
+       sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
        sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
     }
   }
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.py b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
index 4de22a6ecf86be8a4a2ca841b4064746943f87b3..271e943e6e17280b136536375c76934cc09904e2 100644
--- a/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
@@ -2,12 +2,13 @@ import os
 import waLBerla as wlb
 
 DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+LOGLEVEL = 'info'
 
 
 class Scenario:
     def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
                  warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
-                 time_for_benchmark=1, outer_iterations=1):
+                 time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
         self.cells_per_block = cells_per_block
         self.blocks_per_process = 1
 
@@ -22,6 +23,8 @@ class Scenario:
         self.time_for_benchmark = time_for_benchmark
         self.outer_iterations = outer_iterations
 
+        self.packinfo = packinfo
+
     @wlb.member_callback
     def config(self, print_dict=True):
         from pprint import pformat
@@ -42,6 +45,10 @@ class Scenario:
                 'maxIterations': self.max_iterations,
                 'timeForBenchmark': self.time_for_benchmark,
                 'outerIterations': self.outer_iterations,
+                'packinfo': self.packinfo
+            },
+            'Logging': {
+                'logLevel': LOGLEVEL
             }
         }
 
@@ -66,15 +73,18 @@ def single_run():
     time_for_benchmark = 1
     outer_iterations = 1
 
-    scenarios.add(Scenario(cells_per_block=cells_per_block,
-                           gpu_direct_comm=gpu_direct_comm,
-                           layout=layout,
-                           warmup_iterations=warmup_iterations,
-                           iterations=iterations,
-                           min_iterations=min_iterations,
-                           max_iterations=max_iterations,
-                           time_for_benchmark=time_for_benchmark,
-                           outer_iterations=outer_iterations))
+    for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
+
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               gpu_direct_comm=gpu_direct_comm,
+                               layout=layout,
+                               warmup_iterations=warmup_iterations,
+                               iterations=iterations,
+                               min_iterations=min_iterations,
+                               max_iterations=max_iterations,
+                               time_for_benchmark=time_for_benchmark,
+                               outer_iterations=outer_iterations,
+                               packinfo=pack_info_name))
 
 
 wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py b/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ec10fde6415c27428a44237d5240d93aa2c4ab
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py
@@ -0,0 +1,23 @@
+from pystencils import Target
+from lbmpy import Stencil, LBStencil, LBMConfig, create_lb_method
+
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_storage_specification
+
+
+with CodeGeneration() as ctx:
+
+    stencil = LBStencil(Stencil.D3Q27)
+    streaming_pattern = 'pull'
+    nonuniform = False
+    target = Target.GPU
+    data_type = "float64"
+    cpu_openmp = False
+
+    lbm_config = LBMConfig(stencil=stencil)
+    method = create_lb_method(lbm_config=lbm_config)
+
+    generate_lbm_storage_specification(ctx, "StorageSpec", method, lbm_config,
+                                       nonuniform=nonuniform, target=target, data_type=data_type)
+
+    generate_info_header(ctx, 'CommunicationGPUInfoHeader')
diff --git a/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f7d1ab2ed8e67ab3a49aec0f56451c08592c248
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp
@@ -0,0 +1,315 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GPUPackPerformance.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+#include "core/math/all.h"
+#include "core/mpi/all.h"
+
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/GPUWrapper.h"
+#include "gpu/HostFieldAllocator.h"
+#include "gpu/ParallelStreams.h"
+#include "gpu/communication/UniformGPUScheme.h"
+#include "gpu/communication/MemcpyPackInfo.h"
+
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "sqlite/SQLite.h"
+
+#include "CommunicationGPUInfoHeader.h"
+
+#include <cmath>
+
+using namespace walberla;
+using gpu::communication::UniformGPUScheme;
+using StorageSpecification_T = lbm::StorageSpec;
+using Stencil_T = StorageSpecification_T::Stencil;
+
+using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+
+
+int main(int argc, char **argv) {
+   mpi::Environment const env(argc, argv);
+   gpu::selectDeviceBasedOnMpiRank();
+
+   auto mpiManager = mpi::MPIManager::instance();
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) {
+      if (mpiManager->isMPIInitialized())
+         mpiManager->resetMPI();
+
+      WALBERLA_MPI_WORLD_BARRIER()
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// SETUP AND CONFIGURATION                                                                                      ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // Reading parameters
+      auto parameters = config->getOneBlock("Parameters");
+      const std::string databaseFile = parameters.getParameter<std::string>("databaseFile",
+                                                                            "CommunicationGPU.sqlite");
+      auto gpuDirectComm = parameters.getParameter<bool>("gpuDirectComm", false);
+      const std::string layoutStr = parameters.getParameter<std::string>("layout", "fzyx");
+
+      auto runCfg = config->getOneBlock("Run");
+      const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");
+//      const uint_t warmupIterations = runCfg.getParameter<uint_t>("warmupIterations", 2);
+//      uint_t iterations = runCfg.getParameter<uint_t>("iterations", 10);
+//      const uint_t minIterations = runCfg.getParameter<uint_t>("minIterations", 2);
+//      const uint_t maxIterations = runCfg.getParameter<uint_t>("maxIterations", 100);
+//
+//      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
+//      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+
+      field::Layout layout;
+      if (layoutStr == "fzyx")
+         layout = field::fzyx;
+      else if (layoutStr == "zyxf")
+         layout = field::zyxf;
+      else {
+         WALBERLA_ABORT_NO_DEBUG_INFO("Unknown layout string " << layoutStr << ". Valid values are fzyx and zyxf.")
+      }
+
+
+      auto domainCfg = config->getOneBlock("Domain");
+      const Vector3<uint_t> cellsPerBlock = domainCfg.getParameter<Vector3<uint_t> >("cellsPerBlock");
+      uint_t const blocksPerProcess = domainCfg.getParameter<uint_t>("blocksPerProcess", 1);
+
+      auto numProcesses = mpiManager->numProcesses();
+      auto processes = math::getFactors3D(uint_c(numProcesses));
+      auto blockDecomposition = math::getFactors3D(uint_c(numProcesses) * blocksPerProcess);
+      auto aabb = AABB(real_t(0), real_t(0), real_t(0),
+                       real_c(cellsPerBlock[0] * processes[0] * blocksPerProcess),
+                       real_c(cellsPerBlock[1] * processes[1] * blocksPerProcess),
+                       real_c(cellsPerBlock[2] * processes[2] * blocksPerProcess));
+
+      auto blocks = blockforest::createUniformBlockGrid(aabb,
+                                                        blockDecomposition[0], blockDecomposition[1],
+                                                        blockDecomposition[2],
+                                                        cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
+                                                        processes[0], processes[1], processes[2],
+                                                        true, true, true, //periodicity
+                                                        false // keepGlobalBlockInformation
+      );
+
+      const StorageSpecification_T StorageSpec = StorageSpecification_T();
+      auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
+      const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
+      const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);
+
+      gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
+
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// COMMUNICATION SCHEME                                                                                         ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
+      if(packInfoName == "MemcpyPackInfo")
+      {
+         packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+      }
+      else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
+      {
+         packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
+      }
+      else
+      {
+         WALBERLA_ABORT("Unknown packInfoName: " << packInfoName)
+      }
+      // WcTimer timer;
+
+      uint_t packSize = 0;
+
+      for (auto &block : *blocks) {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            packSize += packInfo->size(*dir, &block);
+            WALBERLA_LOG_INFO_ON_ROOT("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block)
+                                      << " bytes will be communicated")
+         }
+      }
+
+      unsigned char *buffer;
+      WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
+
+      unsigned char *beginPtr = buffer;
+      unsigned char *currentPtr = buffer;
+
+
+      for (auto &iBlock : *blocks)
+      {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            auto senderBlock = dynamic_cast< Block * >( &iBlock );
+            packInfo->pack(*dir, currentPtr, senderBlock, nullptr);
+            currentPtr += packInfo->size(*dir, senderBlock);
+         }
+      }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
+      WALBERLA_CHECK_NOT_NULLPTR(currentPtr)
+      WALBERLA_CHECK_NOT_NULLPTR(beginPtr)
+
+      currentPtr = beginPtr;
+
+      for (auto &iBlock : *blocks)
+      {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            auto senderBlock = dynamic_cast< Block * >( &iBlock );
+            packInfo->unpack(*dir, currentPtr, senderBlock, nullptr);
+            currentPtr += packInfo->size(*dir, senderBlock);
+         }
+      }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuFree(buffer)) // release the pack/unpack buffer before the next scenario is run
+
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// TIME STEP DEFINITIONS                                                                                        ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//      commStart();
+//      commWait();
+//
+//      WcTimer warmupTimer;
+//      warmupTimer.start();
+//      for (uint_t warmupCounter = 0; warmupCounter < warmupIterations; ++warmupCounter) {
+//         commStart();
+//         commWait();
+//         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//      }
+//      warmupTimer.end();
+//
+//
+//      auto estimatedTimePerIteration = warmupTimer.last() / real_c(warmupIterations);
+//      if (timeForBenchmark > 0) {
+//         iterations = uint_c(timeForBenchmark / estimatedTimePerIteration);
+//         if (iterations < minIterations)
+//            iterations = minIterations;
+//         if (iterations > maxIterations)
+//            iterations = maxIterations;
+//      }
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//
+//      mpi::broadcastObject(iterations);
+//
+//      auto timingPool = std::make_shared<WcTimingPool>();
+//      communication.enableTiming(timingPool);
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//      WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)
+//
+//      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
+//         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
+//            timingPool->operator[]("commStart").start();
+//            commStart();
+//            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//            timingPool->operator[]("commStart").end();
+//
+//            timingPool->operator[]("commWait").start();
+//            commWait();
+//            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//            timingPool->operator[]("commWait").end();
+//         }
+//      }
+//
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//
+//      auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
+//      WALBERLA_ROOT_SECTION() {
+//         WALBERLA_LOG_RESULT(*reducedTimingPool)
+//
+//         std::map<std::string, walberla::int64_t> integerProperties;
+//         std::map<std::string, double> realProperties;
+//         std::map<std::string, std::string> stringProperties;
+//
+//         auto databaseBlock = config->getBlock("Database");
+//         if (databaseBlock) {
+//            for (auto it = databaseBlock.begin(); it != databaseBlock.end(); ++it)
+//               stringProperties[it->first] = it->second;
+//         }
+//
+//         realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
+//         realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max() / real_c(iterations));
+//
+//         integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
+//         integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
+//         integerProperties["cellsPerBlock2"] = int64_c(cellsPerBlock[2]);
+//
+//         integerProperties["processes0"] = int64_c(processes[0]);
+//         integerProperties["processes1"] = int64_c(processes[1]);
+//         integerProperties["processes2"] = int64_c(processes[2]);
+//
+//         integerProperties["blocks0"] = int64_c(blockDecomposition[0]);
+//         integerProperties["blocks1"] = int64_c(blockDecomposition[1]);
+//         integerProperties["blocks2"] = int64_c(blockDecomposition[2]);
+//
+//         integerProperties["blocksPerProcess"] = int64_c(blocksPerProcess);
+//         integerProperties["cartesianCommunicator"] = mpiManager->hasCartesianSetup();
+//
+//         integerProperties["warmupIterations"] = int64_c(warmupIterations);
+//         integerProperties["iterations"] = int64_c(iterations);
+//         integerProperties["outerIterations"] = int64_c(outerIterations);
+//
+//         stringProperties["layout"] = layoutStr;
+//
+//         stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
+//         stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
+//         stringProperties["SLURM_CPUS_PER_TASK"] = fromEnv("SLURM_CPUS_PER_TASK");
+//         stringProperties["SLURM_JOB_ACCOUNT"] = fromEnv("SLURM_JOB_ACCOUNT");
+//         stringProperties["SLURM_JOB_ID"] = fromEnv("SLURM_JOB_ID");
+//         stringProperties["SLURM_JOB_CPUS_PER_NODE"] = fromEnv("SLURM_JOB_CPUS_PER_NODE");
+//         stringProperties["SLURM_JOB_NAME"] = fromEnv("SLURM_JOB_NAME");
+//         stringProperties["SLURM_JOB_NUM_NODES"] = fromEnv("SLURM_JOB_NUM_NODES");
+//         stringProperties["SLURM_NTASKS"] = fromEnv("SLURM_NTASKS");
+//         stringProperties["SLURM_NTASKS_PER_CORE"] = fromEnv("SLURM_NTASKS_PER_CORE");
+//         stringProperties["SLURM_NTASKS_PER_NODE"] = fromEnv("SLURM_NTASKS_PER_NODE");
+//         stringProperties["SLURM_NTASKS_PER_SOCKET"] = fromEnv("SLURM_NTASKS_PER_SOCKET");
+//         stringProperties["SLURM_TASKS_PER_NODE"] = fromEnv("SLURM_TASKS_PER_NODE");
+//
+//         stringProperties["buildMachine"] = std::string(WALBERLA_BUILD_MACHINE);
+//         stringProperties["gitVersion"] = std::string(WALBERLA_GIT_SHA1);
+//         stringProperties["buildType"] = std::string(WALBERLA_BUILD_TYPE);
+//         stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
+//
+//         auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
+//         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
+//         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
+//      }
+   }
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0cd8969395eacaee42156b326682eb8b191b665
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py
@@ -0,0 +1,90 @@
+import os
+import waLBerla as wlb
+
+DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+LOGLEVEL = 'info'
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
+                 warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
+                 time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
+        self.cells_per_block = cells_per_block
+        self.blocks_per_process = 1
+
+        self.database_file = DB_FILE
+        self.gpu_direct_comm = gpu_direct_comm
+        self.layout = layout
+
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+        self.min_iterations = min_iterations
+        self.max_iterations = max_iterations
+        self.time_for_benchmark = time_for_benchmark
+        self.outer_iterations = outer_iterations
+
+        self.packinfo = packinfo
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'Domain': {
+                'cellsPerBlock': self.cells_per_block,
+                'blocksPerProcess': self.blocks_per_process,
+            },
+            'Parameters': {
+                'databaseFile': self.database_file,
+                'gpuDirectComm': self.gpu_direct_comm,
+                'layout': self.layout,
+            },
+            'Run': {
+                'warmupIterations': self.warmup_iterations,
+                'iterations': self.iterations,
+                'minIterations': self.min_iterations,
+                'maxIterations': self.max_iterations,
+                'timeForBenchmark': self.time_for_benchmark,
+                'outerIterations': self.outer_iterations,
+                'packinfo': self.packinfo
+            },
+            'Logging': {
+                'logLevel': LOGLEVEL
+            }
+        }
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+        return config_dict
+
+
+# -------------------------------------- Profiling -----------------------------------
+def single_run():
+    """Benchmarks the GPU pack/unpack performance of the available pack infos"""
+
+    scenarios = wlb.ScenarioManager()
+
+    cells_per_block = (128, 128, 128)
+    gpu_direct_comm = False
+    layout = "fzyx"
+    warmup_iterations = 10
+    iterations = 100
+    min_iterations = 10
+    max_iterations = 100
+    time_for_benchmark = 1
+    outer_iterations = 1
+
+    for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               gpu_direct_comm=gpu_direct_comm,
+                               layout=layout,
+                               warmup_iterations=warmup_iterations,
+                               iterations=iterations,
+                               min_iterations=min_iterations,
+                               max_iterations=max_iterations,
+                               time_for_benchmark=time_for_benchmark,
+                               outer_iterations=outer_iterations,
+                               packinfo=pack_info_name))
+
+
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+single_run()
diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py
index c113604381be85e3895e3744285a528d4786f84e..01287a764809c96c43de08eb357c9336f5a606d8 100644
--- a/python/lbmpy_walberla/storage_specification.py
+++ b/python/lbmpy_walberla/storage_specification.py
@@ -39,9 +39,6 @@ def generate_lbm_storage_specification(generation_context, class_name: str,
     if nonuniform:
         kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels)
 
-    values_per_cell = len(stencil)
-    dimension = len(stencil[0])
-
     # Pure storage specification
     if not stencil_name:
         raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
diff --git a/src/core/mpi/BufferSystemHelper.impl.h b/src/core/mpi/BufferSystemHelper.impl.h
index e6236f100ca3d8f58817824c16ac5844e41f4af6..52236a7ec4cc03ef2a47966ed699be67a625016d 100644
--- a/src/core/mpi/BufferSystemHelper.impl.h
+++ b/src/core/mpi/BufferSystemHelper.impl.h
@@ -147,7 +147,7 @@ MPIRank KnownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank, Re
 
    recvRequests_[ uint_c( requestIndex ) ] = MPI_REQUEST_NULL;
 
-   MPIRank senderRank = status.MPI_SOURCE;
+   MPIRank const senderRank = status.MPI_SOURCE;
 
    WALBERLA_ASSERT_GREATER_EQUAL( senderRank, 0 );
diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h
index 6c15988f4f2687275fea7f0f8be36b2e7d99fcf6..7a36f0e576af29dc3b93e96b769e17d3377779bc 100644
--- a/src/gpu/communication/MemcpyPackInfo.h
+++ b/src/gpu/communication/MemcpyPackInfo.h
@@ -17,7 +17,7 @@ template<typename GPUFieldType>
 class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo
 {
 public:
-   MemcpyPackInfo( BlockDataID pdfsID_ ) : pdfsID(pdfsID_) {};
+   MemcpyPackInfo( BlockDataID fieldID_ ) : fieldID(fieldID_) {};
    ~MemcpyPackInfo() override = default;
 
    void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override;
@@ -26,7 +26,7 @@ public:
    uint_t size(stencil::Direction dir, IBlock * block) override;
 
 private:
-   BlockDataID pdfsID;
+   BlockDataID fieldID;
    uint_t numberOfGhostLayers_{0};
    bool communicateAllGhostLayers_{true};
diff --git a/src/gpu/communication/MemcpyPackInfo.impl.h b/src/gpu/communication/MemcpyPackInfo.impl.h
index 2110933cda5322828f40cc14b471be5c6a309bfe..843166a55a3ab24db491c9668e85720e51c0aea3 100644
--- a/src/gpu/communication/MemcpyPackInfo.impl.h
+++ b/src/gpu/communication/MemcpyPackInfo.impl.h
@@ -19,7 +19,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
                                           IBlock * block, gpuStream_t stream)
 {
     // Extract field data pointer from the block
-    const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+    const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr )
 
     // cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -68,12 +68,9 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
 template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::communicateLocal( stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream )
 {
-    // WALBERLA_ABORT("The MemcpyPackInfo does not provide a thread safe local communication. Thus is can not be used in local mode. To use it set local useLocalCommunication to false in the communication scheme")
-
-    // Extract field data pointer from the block
-    const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( pdfsID );
-    const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( pdfsID );
+    const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( fieldID );
+    const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR( senderFieldPtr )
     WALBERLA_ASSERT_NOT_NULLPTR( receiverFieldPtr )
 
@@ -128,7 +125,7 @@ template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer,
                                             IBlock * block, gpuStream_t stream)
 {
-    GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+    GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr)
 
     cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -173,100 +170,13 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha
 template<typename GPUFieldType>
 uint_t MemcpyPackInfo< GPUFieldType >::size(stencil::Direction dir, IBlock * block)
 {
-    auto pdfs = block->getData< GPUFieldType >(pdfsID);
+    auto field = block->getData< GPUFieldType >(fieldID);
     CellInterval ci;
-    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( pdfs ) );
-    pdfs->getGhostRegion(dir, ci, nrOfGhostLayers, false);
-
-    /*
-    uint_t elementsPerCell = 0;
-
-    switch( dir )
-    {
-        case stencil::SW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::S:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::W:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::B:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::T:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::BN:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::N:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::TE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::E:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::BE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::SE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::C:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TN:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TS:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::NE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::BW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::NW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::BS:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TW:
-            elementsPerCell = 1;
-            break;
-
-        default:
-            elementsPerCell = 0;
-    }
-
-    return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type);
-    */
-    uint_t totalCells = ci.numCells() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type);
+    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( field ) );
+    field->getGhostRegion(dir, ci, nrOfGhostLayers, false);
+
+    uint_t totalCells = ci.numCells() * field->fSize() * sizeof(typename GPUFieldType::value_type);
 
     return totalCells;
 }
diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h
index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..9057bf7b196d9b64fe911d1bcde0651fd361c5da 100644
--- a/src/gpu/communication/UniformGPUScheme.h
+++ b/src/gpu/communication/UniformGPUScheme.h
@@ -18,7 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
 #pragma once
 
 #include "blockforest/StructuredBlockForest.h"
@@ -30,48 +29,52 @@
 
 #include "stencil/Directions.h"
 
-#include <thread>
-
-#include "gpu/GPURAII.h"
+#include "core/timing/TimingPool.h"
+
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedGPUPackInfo.h"
 
+#include <thread>
+
 namespace walberla {
 namespace gpu
 {
 namespace communication {
 
-
-
 template<typename Stencil>
 class UniformGPUScheme
 {
 public:
-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                              const int tag = 5432 );
 
-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               const Set<SUID> & requiredBlockSelectors,
                               const Set<SUID> & incompatibleBlockSelectors,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                              const int tag = 5432 );
 
+   ~UniformGPUScheme();
+
   void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-   void startCommunication( gpuStream_t stream = nullptr);
-   void wait( gpuStream_t stream = nullptr);
+   void startCommunication();
+   void wait();
 
-   void operator()( gpuStream_t stream = nullptr )         { communicate( stream ); }
-   inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }
+   void operator()()         { communicate(); }
+   inline void communicate() { startCommunication(); wait(); }
 
-   std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr );
+   std::function<void()> getCommunicateFunctor();
+   std::function<void()> getStartCommunicateFunctor();
+   std::function<void()> getWaitFunctor();
+
+   void enableTiming( const shared_ptr<WcTimingPool> & timingPool )
+   {
+      timing_ = true;
+      timingPool_ = timingPool;
+   }
 
 private:
   void setupCommunication();
@@ -92,8 +95,6 @@
 
   std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
 
-  ParallelStreams parallelSectionManager_;
-
   struct Header
   {
      BlockID blockId;
@@ -103,6 +104,11 @@
 
   Set<SUID> requiredBlockSelectors_;
   Set<SUID> incompatibleBlockSelectors_;
+
+  bool timing_{false};
+  shared_ptr<WcTimingPool> timingPool_;
+
+  gpuStream_t streams_[Stencil::Q];
 };
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 28033d1464f76709178042a5c87a27485ceacb44..3c885842f3a5c9f9a9eb50e6b10fbf3f1c85e466 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -18,9 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
-#include "gpu/ParallelStreams.h"
-
 namespace walberla {
 namespace gpu
 {
@@ -28,7 +25,7 @@ namespace communication {
 
 
   template<typename Stencil>
-  UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+  UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                bool sendDirectlyFromGPU,
                                                bool useLocalCommunication,
                                                const int tag )
@@ -39,7 +36,6 @@ namespace communication {
       useLocalCommunication_(useLocalCommunication),
       bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
       bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-      parallelSectionManager_( -1 ),
       requiredBlockSelectors_( Set<SUID>::emptySet() ),
       incompatibleBlockSelectors_( Set<SUID>::emptySet() )
   {
@@ -49,10 +45,15 @@ namespace communication {
         WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
      }
+
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+     }
   }
 
   template<typename Stencil>
-  UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+  UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                const Set<SUID> & requiredBlockSelectors,
                                                const Set<SUID> & incompatibleBlockSelectors,
                                                bool sendDirectlyFromGPU,
@@ -65,7 +66,6 @@ namespace communication {
      useLocalCommunication_(useLocalCommunication),
      bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
      bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-     parallelSectionManager_( -1 ),
     requiredBlockSelectors_( requiredBlockSelectors ),
     incompatibleBlockSelectors_( incompatibleBlockSelectors )
   {
@@ -75,11 +75,25 @@ namespace communication {
        WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
     }
+
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+     }
   }
 
+  template< typename Stencil >
+  UniformGPUScheme< Stencil >::~UniformGPUScheme()
+  {
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+     }
+  }
+
   template<typename Stencil>
-  void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream )
+  void UniformGPUScheme<Stencil>::startCommunication()
   {
      WALBERLA_ASSERT( !communicationInProgress_ )
     auto forest = blockForest_.lock();
@@ -99,9 +113,11 @@ namespace communication {
     for( auto it : headers_ )
        bufferSystemGPU_.sendBuffer( it.first ).clear();
 
+     if(timing_)
+        timingPool_->operator[]("UniformGPUScheme->startCommunication").start();
+
     // Start filling send buffers
     {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
        for( auto &iBlock : *forest )
        {
           auto senderBlock = dynamic_cast< Block * >( &iBlock );
@@ -124,7 +140,7 @@ namespace communication {
                 auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
                 for (auto& pi : packInfos_)
                 {
-                    pi->communicateLocal(*dir, senderBlock, receiverBlock, stream);
+                    pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]);
                 }
              }
             else
@@ -133,26 +149,29 @@ namespace communication {
 
                 for( auto &pi : packInfos_ )
                 {
-                    parallelSection.run([&](auto s) {
                       auto size = pi->size( *dir, senderBlock );
                       auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                       WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                       pi->pack( *dir, gpuDataPtr, senderBlock, s );
+                       pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] );
 
                       if( !sendFromGPU_ )
                       {
                          auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                          WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
-                          WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s ))
+                          WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] ))
                       }
-                    });
                 }
             }
          }
       }
    }
 
    // wait for packing to finish
-    WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) );
+    for (uint_t i = 0; i < Stencil::Q; ++i)
+    {
+       WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+    }
+
+    if(timing_)
+       timingPool_->operator[]("UniformGPUScheme->startCommunication").end();
 
    if( sendFromGPU_ )
       bufferSystemGPU_.sendAll();
@@ -164,15 +183,17 @@ namespace communication {
 
 
   template<typename Stencil>
-  void UniformGPUScheme<Stencil>::wait( gpuStream_t stream )
+  void UniformGPUScheme<Stencil>::wait()
   {
    WALBERLA_ASSERT( communicationInProgress_ )
 
    auto forest = blockForest_.lock();
 
+    if(timing_)
+       timingPool_->operator[]("UniformGPUScheme->wait").start();
+
    if( sendFromGPU_ )
    {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
      for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
      {
         recvInfo.buffer().clear();
@@ -185,16 +206,13 @@ namespace communication {
            auto size = pi->size( header.dir, block );
           auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
           WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-             parallelSection.run([&](auto s) {
-                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-             });
+             pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
        }
      }
    }
   else
   {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
     for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
     {
        auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
@@ -211,17 +229,20 @@ namespace communication {
          auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
          WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
          WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-             parallelSection.run([&](auto s) {
             WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                gpuMemcpyHostToDevice, s ))
-                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-             });
+                                                gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]] ))
+                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
        }
      }
   }
 
-     WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+     }
+
+     if(timing_)
+        timingPool_->operator[]("UniformGPUScheme->wait").end();
 
     communicationInProgress_ = false;
  }
@@ -309,21 +330,21 @@ namespace communication {
  }
 
  template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor()
 {
-    return [this, stream]() { communicate( stream ); };
+    return [this]() { communicate(); };
 }
 
 template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor()
 {
-    return [this, stream]() { startCommunication( stream ); };
+    return [this]() { startCommunication(); };
 }
 
 template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(cudaStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor()
 {
-    return [this, stream]() { wait( stream ); };
+    return [this]() { wait(); };
 }
 
} // namespace communication