Commit c57809bb authored by Markus Holzer

Implemented an application for benchmarking communication on GPUs

parent 2f69de28
Pipeline #53874 failed
@@ -28,11 +28,14 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
         add_subdirectory( NonUniformGridCPU )
     endif()
-    if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT )
-        add_subdirectory( UniformGridGPU )
-        add_subdirectory( NonUniformGridGPU )
-    endif()
+    if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+        add_subdirectory(CommunicationGPU)
+        if ( WALBERLA_BUILD_WITH_CODEGEN )
+            add_subdirectory( UniformGridGPU )
+            add_subdirectory( NonUniformGridGPU )
+        endif()
+    endif ()
 endif()
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.dat" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_add_executable ( NAME CommunicationGPU
                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CommunicationGPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "core/math/all.h"
#include "core/mpi/all.h"
#include "core/waLBerlaBuildInfo.h"
#include "field/AddToStorage.h"
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/FieldCopy.h"
#include "gpu/GPUField.h"
#include "gpu/GPUWrapper.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/ParallelStreams.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "gpu/communication/MemcpyPackInfo.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "stencil/D3Q27.h"
#include "sqlite/SQLite.h"
#include <cmath>
using namespace walberla;
using gpu::communication::UniformGPUScheme;
using Field_T = field::GhostLayerField<real_t, 1>;
using GPUField_T = gpu::GPUField<real_t>;
using Stencil_T = stencil::D3Q27;
std::string fromEnv(const char *envVar) {
   auto env = std::getenv(envVar);
   return env != nullptr ? std::string(env) : "";
}
int main(int argc, char **argv) {
   mpi::Environment const env(argc, argv);
   gpu::selectDeviceBasedOnMpiRank();
   auto mpiManager = mpi::MPIManager::instance();
   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) {
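      // A new scenario may use a different process decomposition, so MPI is reset
      // to let the block forest create a fresh (Cartesian) communicator (assumed intent).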
      if (mpiManager->isMPIInitialized())
         mpiManager->resetMPI();
      WALBERLA_MPI_WORLD_BARRIER()
      WALBERLA_GPU_CHECK(gpuPeekAtLastError())

      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      /// SETUP AND CONFIGURATION ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      auto config = *cfg;
      logging::configureLogging(config);

      // Reading parameters
      auto parameters = config->getOneBlock("Parameters");
      const std::string databaseFile = parameters.getParameter<std::string>("databaseFile", "CommunicationGPU.sqlite");
      auto gpuDirectComm = parameters.getParameter<bool>("gpuDirectComm", false);
      const std::string layoutStr = parameters.getParameter<std::string>("layout", "fzyx");

      auto runCfg = config->getOneBlock("Run");
      const uint_t warmupIterations = runCfg.getParameter<uint_t>("warmupIterations", 2);
      uint_t iterations = runCfg.getParameter<uint_t>("iterations", 10);
      const uint_t minIterations = runCfg.getParameter<uint_t>("minIterations", 2);
      const uint_t maxIterations = runCfg.getParameter<uint_t>("maxIterations", 100);
      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);

      field::Layout layout;
      if (layoutStr == "fzyx")
         layout = field::fzyx;
      else if (layoutStr == "zyxf")
         layout = field::zyxf;
      else {
         WALBERLA_ABORT_NO_DEBUG_INFO("Unknown layout string " << layoutStr << ". Valid values are fzyx and zyxf.")
      }
auto domainCfg = config->getOneBlock("Domain");
const Vector3<uint_t> cellsPerBlock = domainCfg.getParameter<Vector3<uint_t> >("cellsPerBlock");
uint_t const blocksPerProcess = domainCfg.getParameter<uint_t>("blocksPerProcess", 1);
auto numProcesses = mpiManager->numProcesses();
auto processes = math::getFactors3D(uint_c(numProcesses));
auto blockDecomposition = math::getFactors3D(uint_c(numProcesses) * blocksPerProcess);
auto aabb = AABB(real_t(0), real_t(0), real_t(0),
real_c(cellsPerBlock[0] * processes[0] * blocksPerProcess),
real_c(cellsPerBlock[1] * processes[1] * blocksPerProcess),
real_c(cellsPerBlock[2] * processes[2] * blocksPerProcess));
auto blocks = blockforest::createUniformBlockGrid(aabb,
blockDecomposition[0], blockDecomposition[1],
blockDecomposition[2],
cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
processes[0], processes[1], processes[2],
true, true, true, //periodicity
false // keepGlobalBlockInformation
);
auto rank = mpiManager->rank();
auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
const BlockDataID fieldCPU = field::addToStorage<Field_T>(blocks, "field", real_c(rank), layout, uint_c(1),
allocator);
const BlockDataID fieldGPU = gpu::addGPUFieldToStorage<Field_T>(blocks, fieldCPU, "field GPU", true);
gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      /// COMMUNICATION SCHEME ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
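      // UniformGPUScheme exchanges the ghost layers of all blocks over the D3Q27 neighborhood.
      // With gpuDirectComm enabled, MPI sends/receives the device buffers directly
      // (requires a GPU-aware MPI implementation); otherwise data is staged through host memory.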
      UniformGPUScheme<Stencil_T> communication(blocks, gpuDirectComm);
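      // MemcpyPackInfo fills the communication buffers with plain device memory copies
      // instead of dedicated pack/unpack kernels.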
      auto packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T> >(fieldGPU);
      communication.addPackInfo(packInfo);
      auto communicate = communication.getCommunicateFunctor();
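      // communicate() bundles start and wait in one call; the benchmark below
      // times the two phases separately via the dedicated functors.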
      auto commStart = communication.getStartCommunicateFunctor();
      auto commWait = communication.getWaitFunctor();
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      /// TIME STEP DEFINITIONS ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
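      // One untimed exchange up front so buffer allocation and communication setup
      // do not distort the warm-up measurement below (assumed intent).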
      commStart();
      commWait();

      WcTimer warmupTimer;
      warmupTimer.start();
      for (uint_t warmupCounter = 0; warmupCounter < warmupIterations; ++warmupCounter) {
         commStart();
         commWait();
         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
      }
      warmupTimer.end();
      auto estimatedTimePerIteration = warmupTimer.last() / real_c(warmupIterations);
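      // If a target benchmark time is given, derive the iteration count from the
      // measured warm-up time and clamp it to [minIterations, maxIterations].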
      if (timeForBenchmark > 0) {
         iterations = uint_c(timeForBenchmark / estimatedTimePerIteration);
         if (iterations < minIterations)
            iterations = minIterations;
         if (iterations > maxIterations)
            iterations = maxIterations;
      }
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
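      // The iteration count was computed from rank-local timings; broadcast it so
      // all processes run the same number of iterations.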
      mpi::broadcastObject(iterations);

      WcTimingPool timingPool;
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
      WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations);
      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
         timingPool["totalTime"].start();
         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
            timingPool["commStart"].start();
            commStart();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
            timingPool["commStart"].end();

            timingPool["commWait"].start();
            commWait();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
            timingPool["commWait"].end();
         }
         timingPool["totalTime"].end();
      }
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
      auto reducedTimingPool = timingPool.getReduced(timing::REDUCE_TOTAL, 0);

      WALBERLA_ROOT_SECTION() {
         WALBERLA_LOG_RESULT(*reducedTimingPool);

         std::map<std::string, walberla::int64_t> integerProperties;
         std::map<std::string, double> realProperties;
         std::map<std::string, std::string> stringProperties;

         auto databaseBlock = config->getBlock("Database");
         if (databaseBlock) {
            for (auto it = databaseBlock.begin(); it != databaseBlock.end(); ++it)
               stringProperties[it->first] = it->second;
         }

         realProperties["total_min"] = real_c(timingPool["totalTime"].min()) / real_c(iterations);
         realProperties["total_avg"] = real_c(timingPool["totalTime"].average()) / real_c(iterations);
         realProperties["total_max"] = real_c(timingPool["totalTime"].max()) / real_c(iterations);

         integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
         integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
         integerProperties["cellsPerBlock2"] = int64_c(cellsPerBlock[2]);
         integerProperties["processes0"] = int64_c(processes[0]);
         integerProperties["processes1"] = int64_c(processes[1]);
         integerProperties["processes2"] = int64_c(processes[2]);
         integerProperties["blocks0"] = int64_c(blockDecomposition[0]);
         integerProperties["blocks1"] = int64_c(blockDecomposition[1]);
         integerProperties["blocks2"] = int64_c(blockDecomposition[2]);
         integerProperties["blocksPerProcess"] = int64_c(blocksPerProcess);
         integerProperties["cartesianCommunicator"] = mpiManager->hasCartesianSetup();
         integerProperties["warmupIterations"] = int64_c(warmupIterations);
         integerProperties["iterations"] = int64_c(iterations);
         integerProperties["outerIterations"] = int64_c(outerIterations);

         stringProperties["layout"] = layoutStr;
         stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
         stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
         stringProperties["SLURM_CPUS_PER_TASK"] = fromEnv("SLURM_CPUS_PER_TASK");
         stringProperties["SLURM_JOB_ACCOUNT"] = fromEnv("SLURM_JOB_ACCOUNT");
         stringProperties["SLURM_JOB_ID"] = fromEnv("SLURM_JOB_ID");
         stringProperties["SLURM_JOB_CPUS_PER_NODE"] = fromEnv("SLURM_JOB_CPUS_PER_NODE");
         stringProperties["SLURM_JOB_NAME"] = fromEnv("SLURM_JOB_NAME");
         stringProperties["SLURM_JOB_NUM_NODES"] = fromEnv("SLURM_JOB_NUM_NODES");
         stringProperties["SLURM_NTASKS"] = fromEnv("SLURM_NTASKS");
         stringProperties["SLURM_NTASKS_PER_CORE"] = fromEnv("SLURM_NTASKS_PER_CORE");
         stringProperties["SLURM_NTASKS_PER_NODE"] = fromEnv("SLURM_NTASKS_PER_NODE");
         stringProperties["SLURM_NTASKS_PER_SOCKET"] = fromEnv("SLURM_NTASKS_PER_SOCKET");
         stringProperties["SLURM_TASKS_PER_NODE"] = fromEnv("SLURM_TASKS_PER_NODE");
         stringProperties["buildMachine"] = std::string(WALBERLA_BUILD_MACHINE);
         stringProperties["gitVersion"] = std::string(WALBERLA_GIT_SHA1);
         stringProperties["buildType"] = std::string(WALBERLA_BUILD_TYPE);
         stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);

         auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, timingPool, "TimingRoot");
         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
      }
   }
   return EXIT_SUCCESS;
}
import os
import waLBerla as wlb
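# The output database can be redirected through the DB_FILE environment variable.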
DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
class Scenario:
    def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
                 warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
                 time_for_benchmark=1, outer_iterations=1):
        self.cells_per_block = cells_per_block
        self.blocks_per_process = 1
        self.database_file = DB_FILE
        self.gpu_direct_comm = gpu_direct_comm
        self.layout = layout
        self.warmup_iterations = warmup_iterations
        self.iterations = iterations
        self.min_iterations = min_iterations
        self.max_iterations = max_iterations
        self.time_for_benchmark = time_for_benchmark
        self.outer_iterations = outer_iterations

    @wlb.member_callback
    def config(self, print_dict=True):
        from pprint import pformat
        config_dict = {
            'Domain': {
                'cellsPerBlock': self.cells_per_block,
                'blocksPerProcess': self.blocks_per_process,
            },
            'Parameters': {
                'databaseFile': self.database_file,
                'gpuDirectComm': self.gpu_direct_comm,
                'layout': self.layout,
            },
            'Run': {
                'warmupIterations': self.warmup_iterations,
                'iterations': self.iterations,
                'minIterations': self.min_iterations,
                'maxIterations': self.max_iterations,
                'timeForBenchmark': self.time_for_benchmark,
                'outerIterations': self.outer_iterations,
            }
        }
        if print_dict:
            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
        return config_dict
# -------------------------------------- Profiling -----------------------------------


def single_run():
    """Runs a single GPU ghost-layer communication benchmark scenario."""
    scenarios = wlb.ScenarioManager()

    cells_per_block = (128, 128, 128)
    gpu_direct_comm = False
    layout = "fzyx"
    warmup_iterations = 10
    iterations = 100
    min_iterations = 10
    max_iterations = 100
    time_for_benchmark = 1
    outer_iterations = 1

    scenarios.add(Scenario(cells_per_block=cells_per_block,
                           gpu_direct_comm=gpu_direct_comm,
                           layout=layout,
                           warmup_iterations=warmup_iterations,
                           iterations=iterations,
                           min_iterations=min_iterations,
                           max_iterations=max_iterations,
                           time_for_benchmark=time_for_benchmark,
                           outer_iterations=outer_iterations))
    wlb.log_info_on_root(f"Running benchmark scenario, saving results to {DB_FILE}")


single_run()
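The run stores its results through waLBerla's sqlite module (storeRunInSqliteDB and storeTimingPoolInSqliteDB above). A minimal sketch for inspecting the resulting database afterwards, using only Python's standard sqlite3 module; the file name matches the DB_FILE default above, and the tables are listed generically since their names and schema are defined by waLBerla's sqlite module:

import sqlite3

conn = sqlite3.connect("CommunicationGPU.sqlite3")
# list all tables written by the benchmark run
tables = [name for (name,) in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")]
for table in tables:
    # show a few rows from each table
    for row in conn.execute(f"SELECT * FROM {table} LIMIT 3"):
        print(table, row)
conn.close()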