diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index f37d24767eb383e55b1ff2764770ee525bb54c68..e6799acb7b4247371d9979f175b3064b873e1f43 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -28,11 +28,14 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
       add_subdirectory( NonUniformGridCPU )
    endif()
 
 
-   if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT )
-      add_subdirectory( UniformGridGPU )
-      add_subdirectory( NonUniformGridGPU )
-   endif()
+   if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+      add_subdirectory(CommunicationGPU)
+      if ( WALBERLA_BUILD_WITH_CODEGEN )
+         add_subdirectory( UniformGridGPU )
+         add_subdirectory( NonUniformGridGPU )
+      endif()
+   endif ()
 
 endif()
 
diff --git a/apps/benchmarks/CommunicationGPU/CMakeLists.txt b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..229e0a4a4164f5c5480c161dfb10fa134493c00a
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
@@ -0,0 +1,7 @@
+waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.dat" )
+waLBerla_link_files_to_builddir( "*.py" )
+
+
+waLBerla_add_executable ( NAME CommunicationGPU
+                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling )
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32a504072d5e776ac9e0a89bd1669a636cb6416d
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
@@ -0,0 +1,266 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CommunicationGPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+#include "core/math/all.h"
+#include "core/mpi/all.h"
+#include "core/waLBerlaBuildInfo.h"
+
+#include "field/AddToStorage.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+#include "gpu/HostFieldAllocator.h"
+#include "gpu/ParallelStreams.h"
+#include "gpu/communication/UniformGPUScheme.h"
+#include "gpu/communication/MemcpyPackInfo.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "stencil/D3Q27.h"
+
+#include "sqlite/SQLite.h"
+
+#include <cmath>
+
+using namespace walberla;
+using gpu::communication::UniformGPUScheme;
+using Field_T = field::GhostLayerField<real_t, 1>;
+using GPUField_T = gpu::GPUField<real_t>;
+
+using Stencil_T = stencil::D3Q27;
+
+std::string fromEnv(const char *envVar) {
+   auto env = std::getenv(envVar);
+   return env != nullptr ? std::string(env) : "";
+}
+
+int main(int argc, char **argv) {
+   mpi::Environment const env(argc, argv);
+   gpu::selectDeviceBasedOnMpiRank();
+
+   auto mpiManager = mpi::MPIManager::instance();
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) {
+      if (mpiManager->isMPIInitialized())
+         mpiManager->resetMPI();
+
+      WALBERLA_MPI_WORLD_BARRIER()
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                        SETUP AND CONFIGURATION                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // Reading parameters
+      auto parameters = config->getOneBlock("Parameters");
+      const std::string databaseFile = parameters.getParameter<std::string>("databaseFile",
+                                                                            "CommunicationGPU.sqlite");
+      auto gpuDirectComm = parameters.getParameter<bool>("gpuDirectComm", false);
+      const std::string layoutStr = parameters.getParameter<std::string>("layout", "fzyx");
+
+      auto runCfg = config->getOneBlock("Run");
+      const uint_t warmupIterations = runCfg.getParameter<uint_t>("warmupIterations", 2);
+      uint_t iterations = runCfg.getParameter<uint_t>("iterations", 10);
+      const uint_t minIterations = runCfg.getParameter<uint_t>("minIterations", 2);
+      const uint_t maxIterations = runCfg.getParameter<uint_t>("maxIterations", 100);
+
+      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
+      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+
+      field::Layout layout;
+      if (layoutStr == "fzyx")
+         layout = field::fzyx;
+      else if (layoutStr == "zyxf")
+         layout = field::zyxf;
+      else {
+         WALBERLA_ABORT_NO_DEBUG_INFO("Unknown layout string " << layoutStr << ". Valid values are fzyx and zyxf.")
+      }
+
+
+      auto domainCfg = config->getOneBlock("Domain");
+      const Vector3<uint_t> cellsPerBlock = domainCfg.getParameter<Vector3<uint_t> >("cellsPerBlock");
+      uint_t const blocksPerProcess = domainCfg.getParameter<uint_t>("blocksPerProcess", 1);
+
+      auto numProcesses = mpiManager->numProcesses();
+      auto processes = math::getFactors3D(uint_c(numProcesses));
+      auto blockDecomposition = math::getFactors3D(uint_c(numProcesses) * blocksPerProcess);
+      auto aabb = AABB(real_t(0), real_t(0), real_t(0),
+                       real_c(cellsPerBlock[0] * processes[0] * blocksPerProcess),
+                       real_c(cellsPerBlock[1] * processes[1] * blocksPerProcess),
+                       real_c(cellsPerBlock[2] * processes[2] * blocksPerProcess));
+
+      auto blocks = blockforest::createUniformBlockGrid(aabb,
+                                                        blockDecomposition[0], blockDecomposition[1],
+                                                        blockDecomposition[2],
+                                                        cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
+                                                        processes[0], processes[1], processes[2],
+                                                        true, true, true, //periodicity
+                                                        false // keepGlobalBlockInformation
+      );
+
+      auto rank = mpiManager->rank();
+
+      auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
+      const BlockDataID fieldCPU = field::addToStorage<Field_T>(blocks, "field", real_c(rank), layout, uint_c(1),
+                                                                allocator);
+      const BlockDataID fieldGPU = gpu::addGPUFieldToStorage<Field_T>(blocks, fieldCPU, "field GPU", true);
+
+      gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
+
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                           COMMUNICATION SCHEME                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      UniformGPUScheme<Stencil_T> communication(blocks, gpuDirectComm);
+      auto packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+      communication.addPackInfo(packInfo);
+
+      auto communicate = communication.getCommunicateFunctor();
+      auto commStart = communication.getStartCommunicateFunctor();
+      auto commWait = communication.getWaitFunctor();
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                          TIME STEP DEFINITIONS                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      commStart();
+      commWait();
+
+      WcTimer warmupTimer;
+      warmupTimer.start();
+      for (uint_t warmupCounter = 0; warmupCounter < warmupIterations; ++warmupCounter) {
+         commStart();
+         commWait();
+         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      }
+      warmupTimer.end();
+
+
+      auto estimatedTimePerIteration = warmupTimer.last() / real_c(warmupIterations);
+      if (timeForBenchmark > 0) {
+         iterations = uint_c(timeForBenchmark / estimatedTimePerIteration);
+         if (iterations < minIterations)
+            iterations = minIterations;
+         if (iterations > maxIterations)
+            iterations = maxIterations;
+      }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
+      mpi::broadcastObject(iterations);
+
+      WcTimingPool timingPool;
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations);
+      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
+         timingPool["totalTime"].start();
+         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
+            timingPool["commStart"].start();
+            commStart();
+            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+            timingPool["commStart"].end();
+
+            timingPool["commWait"].start();
+            commWait();
+            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+            timingPool["commWait"].end();
+         }
+         timingPool["totalTime"].end();
+      }
+
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
+      auto reducedTimingPool = timingPool.getReduced(timing::REDUCE_TOTAL, 0);
+      WALBERLA_ROOT_SECTION() {
+         WALBERLA_LOG_RESULT(*reducedTimingPool);
+
+         std::map<std::string, walberla::int64_t> integerProperties;
+         std::map<std::string, double> realProperties;
+         std::map<std::string, std::string> stringProperties;
+
+         auto databaseBlock = config->getBlock("Database");
+         if (databaseBlock) {
+            for (auto it = databaseBlock.begin(); it != databaseBlock.end(); ++it)
+               stringProperties[it->first] = it->second;
+         }
+
+         realProperties["total_min"] = real_c(timingPool["totalTime"].min()) / real_c(iterations);
+         realProperties["total_avg"] = real_c(timingPool["totalTime"].average() / real_c(iterations));
+         realProperties["total_max"] = real_c(timingPool["totalTime"].max() / real_c(iterations));
+
+         integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
+         integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
+         integerProperties["cellsPerBlock2"] = int64_c(cellsPerBlock[2]);
+
+         integerProperties["processes0"] = int64_c(processes[0]);
+         integerProperties["processes1"] = int64_c(processes[1]);
+         integerProperties["processes2"] = int64_c(processes[2]);
+
+         integerProperties["blocks0"] = int64_c(blockDecomposition[0]);
+         integerProperties["blocks1"] = int64_c(blockDecomposition[1]);
+         integerProperties["blocks2"] = int64_c(blockDecomposition[2]);
+
+         integerProperties["blocksPerProcess"] = int64_c(blocksPerProcess);
+         integerProperties["cartesianCommunicator"] = mpiManager->hasCartesianSetup();
+
+         integerProperties["warmupIterations"] = int64_c(warmupIterations);
+         integerProperties["iterations"] = int64_c(iterations);
+         integerProperties["outerIterations"] = int64_c(outerIterations);
+
+         stringProperties["layout"] = layoutStr;
+
+         stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
+         stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
+         stringProperties["SLURM_CPUS_PER_TASK"] = fromEnv("SLURM_CPUS_PER_TASK");
+         stringProperties["SLURM_JOB_ACCOUNT"] = fromEnv("SLURM_JOB_ACCOUNT");
+         stringProperties["SLURM_JOB_ID"] = fromEnv("SLURM_JOB_ID");
+         stringProperties["SLURM_JOB_CPUS_PER_NODE"] = fromEnv("SLURM_JOB_CPUS_PER_NODE");
+         stringProperties["SLURM_JOB_NAME"] = fromEnv("SLURM_JOB_NAME");
+         stringProperties["SLURM_JOB_NUM_NODES"] = fromEnv("SLURM_JOB_NUM_NODES");
+         stringProperties["SLURM_NTASKS"] = fromEnv("SLURM_NTASKS");
+         stringProperties["SLURM_NTASKS_PER_CORE"] = fromEnv("SLURM_NTASKS_PER_CORE");
+         stringProperties["SLURM_NTASKS_PER_NODE"] = fromEnv("SLURM_NTASKS_PER_NODE");
+         stringProperties["SLURM_NTASKS_PER_SOCKET"] = fromEnv("SLURM_NTASKS_PER_SOCKET");
+         stringProperties["SLURM_TASKS_PER_NODE"] = fromEnv("SLURM_TASKS_PER_NODE");
+
+         stringProperties["buildMachine"] = std::string(WALBERLA_BUILD_MACHINE);
+         stringProperties["gitVersion"] = std::string(WALBERLA_GIT_SHA1);
+         stringProperties["buildType"] = std::string(WALBERLA_BUILD_TYPE);
+         stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
+
+         auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
+         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, timingPool, "TimingRoot");
+         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
+      }
+   }
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.py b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..4de22a6ecf86be8a4a2ca841b4064746943f87b3
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
@@ -0,0 +1,81 @@
+import os
+import waLBerla as wlb
+
+DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
+                 warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
+                 time_for_benchmark=1, outer_iterations=1):
+        self.cells_per_block = cells_per_block
+        self.blocks_per_process = 1
+
+        self.database_file = DB_FILE
+        self.gpu_direct_comm = gpu_direct_comm
+        self.layout = layout
+
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+        self.min_iterations = min_iterations
+        self.max_iterations = max_iterations
+        self.time_for_benchmark = time_for_benchmark
+        self.outer_iterations = outer_iterations
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'Domain': {
+                'cellsPerBlock': self.cells_per_block,
+                'blocksPerProcess': self.blocks_per_process,
+            },
+            'Parameters': {
+                'databaseFile': self.database_file,
+                'gpuDirectComm': self.gpu_direct_comm,
+                'layout': self.layout,
+            },
+            'Run': {
+                'warmupIterations': self.warmup_iterations,
+                'iterations': self.iterations,
+                'minIterations': self.min_iterations,
+                'maxIterations': self.max_iterations,
+                'timeForBenchmark': self.time_for_benchmark,
+                'outerIterations': self.outer_iterations,
+            }
+        }
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+        return config_dict
+
+
+# -------------------------------------- Profiling -----------------------------------
+def single_run():
+    """Tests different communication overlapping strategies"""
+
+    scenarios = wlb.ScenarioManager()
+
+    cells_per_block = (128, 128, 128)
+    gpu_direct_comm = False
+    layout = "fzyx"
+    warmup_iterations = 10
+    iterations = 100
+    min_iterations = 10
+    max_iterations = 100
+    time_for_benchmark = 1
+    outer_iterations = 1
+
+    scenarios.add(Scenario(cells_per_block=cells_per_block,
+                           gpu_direct_comm=gpu_direct_comm,
+                           layout=layout,
+                           warmup_iterations=warmup_iterations,
+                           iterations=iterations,
+                           min_iterations=min_iterations,
+                           max_iterations=max_iterations,
+                           time_for_benchmark=time_for_benchmark,
+                           outer_iterations=outer_iterations))
+
+
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+single_run()