Commit 7798ceb2 authored by Markus Holzer

Added more tests

parent c57809bb
Pipeline #53904 failed
Showing: 608 additions and 198 deletions
@@ -29,9 +29,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
 endif()

 if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   add_subdirectory(CommunicationGPU)
    if ( WALBERLA_BUILD_WITH_CODEGEN )
-      add_subdirectory( CommunicationGPU )
       add_subdirectory( UniformGridGPU )
       add_subdirectory( NonUniformGridGPU )
    endif()
...
@@ -3,5 +3,18 @@ waLBerla_link_files_to_builddir( "*.dat" )
 waLBerla_link_files_to_builddir( "*.py" )

-waLBerla_add_executable ( NAME CommunicationGPU
-                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling )
+waLBerla_generate_target_from_python(NAME CommunicationGPUGenerated
+                                     FILE CommunicationGPUCodeGen.py
+                                     OUT_FILES StorageSpec.h StorageSpec.${CODEGEN_FILE_SUFFIX}
+                                               CommunicationGPUInfoHeader.h
+                                     )
+
+waLBerla_add_executable ( NAME CommunicationGPU
+                          FILES CommunicationGPU.cpp
+                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
+
+waLBerla_add_executable ( NAME GPUPackPerformance
+                          FILES GPUPackPerformance.cpp
+                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
@@ -28,33 +28,34 @@
 #include "core/mpi/all.h"
 #include "core/waLBerlaBuildInfo.h"

-#include "field/AddToStorage.h"
-
-#include "gpu/AddGPUFieldToStorage.h"
 #include "gpu/DeviceSelectMPI.h"
 #include "gpu/FieldCopy.h"
-#include "gpu/GPUField.h"
 #include "gpu/GPUWrapper.h"
 #include "gpu/HostFieldAllocator.h"
 #include "gpu/ParallelStreams.h"
 #include "gpu/communication/UniformGPUScheme.h"
 #include "gpu/communication/MemcpyPackInfo.h"

+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"

-#include "stencil/D3Q27.h"
 #include "sqlite/SQLite.h"

-#include <cmath>
+#include "CommunicationGPUInfoHeader.h"

 using namespace walberla;
 using gpu::communication::UniformGPUScheme;

-using Field_T = field::GhostLayerField<real_t, 1>;
-using GPUField_T = gpu::GPUField<real_t>;
-using Stencil_T = stencil::D3Q27;
+using StorageSpecification_T = lbm::StorageSpec;
+using Stencil_T = StorageSpecification_T::Stencil;
+using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;

 std::string fromEnv(const char *envVar) {
   auto env = std::getenv(envVar);
@@ -95,6 +96,7 @@ int main(int argc, char **argv) {
   const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
   const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+  const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");

   field::Layout layout;
   if (layoutStr == "fzyx")
@@ -127,12 +129,10 @@ int main(int argc, char **argv) {
                                                       false // keepGlobalBlockInformation
     );

-    auto rank = mpiManager->rank();
+    const StorageSpecification_T StorageSpec = StorageSpecification_T();
     auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
-    const BlockDataID fieldCPU = field::addToStorage<Field_T>(blocks, "field", real_c(rank), layout, uint_c(1),
-                                                              allocator);
-    const BlockDataID fieldGPU = gpu::addGPUFieldToStorage<Field_T>(blocks, fieldCPU, "field GPU", true);
+    const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
+    const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);

     gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
@@ -141,14 +141,40 @@ int main(int argc, char **argv) {
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     /// COMMUNICATION SCHEME ///
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

+    std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
+    if (packInfoName == "MemcpyPackInfo")
+    {
+      packInfo = std::make_shared<gpu::communication::MemcpyPackInfo< GPUField_T >>(fieldGPU);
+    }
+    else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
+    {
+      packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
+    }
+    else
+    {
+      WALBERLA_ABORT("Unknown packInfoName: " << packInfoName)
+    }
+
     UniformGPUScheme<Stencil_T> communication(blocks, gpuDirectComm);
-    auto packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
     communication.addPackInfo(packInfo);

-    auto communicate = communication.getCommunicateFunctor();
     auto commStart = communication.getStartCommunicateFunctor();
     auto commWait = communication.getWaitFunctor();

+    WALBERLA_ROOT_SECTION()
+    {
+      for (auto &block : *blocks)
+      {
+        for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+        {
+          WALBERLA_LOG_INFO("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block) << " bytes will be communicated")
+        }
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     /// TIME STEP DEFINITIONS ///
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
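A rough plausibility check for these logged sizes (an illustration, not part of the commit): with the default 128^3 block, one ghost layer, D3Q27 and float64, the MemcpyPackInfo path packs all 27 PDF values per ghost cell, so each face direction should report roughly 3.5 MB:

    // Hedged back-of-the-envelope check, assuming cellsPerBlock = (128, 128, 128):
    const uint_t faceCells = uint_c(128 * 128);                // ghost slice behind a face direction, one layer deep
    const uint_t faceBytes = faceCells * 27 * sizeof(double);  // = 3'538'944 bytes
    // Edge directions cover only 128 cells, corner directions a single cell.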
@@ -178,30 +204,30 @@ int main(int argc, char **argv) {
     mpi::broadcastObject(iterations);

-    WcTimingPool timingPool;
+    auto timingPool = std::make_shared<WcTimingPool>();
+    communication.enableTiming(timingPool);
     WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-    WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations);
+    WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)

     for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
-      timingPool["totalTime"].start();
       for (uint_t ctr = 0; ctr < iterations; ++ctr) {
-        timingPool["commStart"].start();
+        timingPool->operator[]("commStart").start();
         commStart();
         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-        timingPool["commStart"].end();
+        timingPool->operator[]("commStart").end();

-        timingPool["commWait"].start();
+        timingPool->operator[]("commWait").start();
         commWait();
         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-        timingPool["commWait"].end();
+        timingPool->operator[]("commWait").end();
       }
-      timingPool["totalTime"].end();
     }

     WALBERLA_GPU_CHECK(gpuDeviceSynchronize())

-    auto reducedTimingPool = timingPool.getReduced(timing::REDUCE_TOTAL, 0);
+    auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
     WALBERLA_ROOT_SECTION() {
-      WALBERLA_LOG_RESULT(*reducedTimingPool);
+      WALBERLA_LOG_RESULT(*reducedTimingPool)

       std::map<std::string, walberla::int64_t> integerProperties;
       std::map<std::string, double> realProperties;
@@ -213,9 +239,8 @@ int main(int argc, char **argv) {
           stringProperties[it->first] = it->second;
       }

-      realProperties["total_min"] = real_c(timingPool["totalTime"].min()) / real_c(iterations);
-      realProperties["total_avg"] = real_c(timingPool["totalTime"].average() / real_c(iterations));
-      realProperties["total_max"] = real_c(timingPool["totalTime"].max() / real_c(iterations));
+      realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
+      realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max() / real_c(iterations));

       integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
       integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
@@ -237,6 +262,7 @@ int main(int argc, char **argv) {
       integerProperties["outerIterations"] = int64_c(outerIterations);

       stringProperties["layout"] = layoutStr;
+      stringProperties["packInfoName"] = packInfoName;

       stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
       stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
@@ -258,7 +284,7 @@ int main(int argc, char **argv) {
       stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);

       auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
-      sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, timingPool, "TimingRoot");
+      sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
       sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
     }
   }
...
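The switch from a stack-allocated WcTimingPool to a shared_ptr is what feeds the new UniformGPUScheme::enableTiming() hook introduced further down in this commit; a minimal sketch of the resulting wiring, reusing the scheme and functors from above:

    // Minimal sketch, not verbatim from the commit: one pool shared between caller and scheme.
    auto timingPool = std::make_shared<WcTimingPool>();
    communication.enableTiming(timingPool);   // scheme fills "UniformGPUScheme->startCommunication" / "->wait"
    (*timingPool)["commStart"].start();       // caller-side timers live in the same pool
    commStart();
    (*timingPool)["commStart"].end();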
@@ -2,12 +2,13 @@ import os
 import waLBerla as wlb

 DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+LOGLEVEL = 'info'


 class Scenario:
     def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
                  warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
-                 time_for_benchmark=1, outer_iterations=1):
+                 time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
         self.cells_per_block = cells_per_block
         self.blocks_per_process = 1
@@ -22,6 +23,8 @@ class Scenario:
         self.time_for_benchmark = time_for_benchmark
         self.outer_iterations = outer_iterations

+        self.packinfo = packinfo
+
     @wlb.member_callback
     def config(self, print_dict=True):
         from pprint import pformat
@@ -42,6 +45,10 @@ class Scenario:
                 'maxIterations': self.max_iterations,
                 'timeForBenchmark': self.time_for_benchmark,
                 'outerIterations': self.outer_iterations,
+                'packinfo': self.packinfo
+            },
+            'Logging': {
+                'logLevel': LOGLEVEL
             }
         }
@@ -66,15 +73,18 @@ def single_run():
     time_for_benchmark = 1
     outer_iterations = 1

-    scenarios.add(Scenario(cells_per_block=cells_per_block,
-                           gpu_direct_comm=gpu_direct_comm,
-                           layout=layout,
-                           warmup_iterations=warmup_iterations,
-                           iterations=iterations,
-                           min_iterations=min_iterations,
-                           max_iterations=max_iterations,
-                           time_for_benchmark=time_for_benchmark,
-                           outer_iterations=outer_iterations))
+    for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               gpu_direct_comm=gpu_direct_comm,
+                               layout=layout,
+                               warmup_iterations=warmup_iterations,
+                               iterations=iterations,
+                               min_iterations=min_iterations,
+                               max_iterations=max_iterations,
+                               time_for_benchmark=time_for_benchmark,
+                               outer_iterations=outer_iterations,
+                               packinfo=pack_info_name))

     wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
...
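With the loop above, one call to single_run() now enqueues two scenarios, one per pack info. Written out explicitly (a sketch; all other arguments keep the values set in single_run()):

    # Equivalent expansion of the loop, relying on the Scenario defaults:
    scenarios.add(Scenario(packinfo="MemcpyPackInfo"))
    scenarios.add(Scenario(packinfo="UniformGeneratedGPUPdfPackInfo"))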
from pystencils import Target

from lbmpy import Stencil, LBStencil, LBMConfig, create_lb_method

from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_storage_specification

with CodeGeneration() as ctx:
    stencil = LBStencil(Stencil.D3Q27)
    streaming_pattern = 'pull'
    nonuniform = False
    target = Target.GPU
    data_type = "float64"
    cpu_openmp = False

    lbm_config = LBMConfig(stencil=stencil)
    method = create_lb_method(lbm_config=lbm_config)

    # Emit the PDF storage specification (StorageSpec.h plus the matching source file)
    # consumed by both benchmark executables.
    generate_lbm_storage_specification(ctx, "StorageSpec", method, lbm_config,
                                       nonuniform=nonuniform, target=target, data_type=data_type)

    # Aggregating header that re-exports the generated files.
    generate_info_header(ctx, 'CommunicationGPUInfoHeader')
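As far as this commit relies on it, the script emits the class lbm::StorageSpec and the aggregating info header; the C++ sources in this commit consume them through these aliases:

    // From CommunicationGPU.cpp above: the generated storage specification drives all field types.
    using StorageSpecification_T = lbm::StorageSpec;
    using Stencil_T              = StorageSpecification_T::Stencil;   // D3Q27 here
    using Field_T                = lbm_generated::PdfField< StorageSpecification_T >;
    using GPUField_T             = lbm_generated::GPUPdfField< StorageSpecification_T >;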
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GPUPackPerformance.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "core/math/all.h"
#include "core/mpi/all.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/FieldCopy.h"
#include "gpu/GPUWrapper.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/ParallelStreams.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "gpu/communication/MemcpyPackInfo.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/gpu/AddToStorage.h"
#include "lbm_generated/gpu/GPUPdfField.h"
#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "sqlite/SQLite.h"
#include "CommunicationGPUInfoHeader.h"
#include <cmath>
using namespace walberla;
using gpu::communication::UniformGPUScheme;
using StorageSpecification_T = lbm::StorageSpec;
using Stencil_T = StorageSpecification_T::Stencil;
using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
int main(int argc, char **argv) {
mpi::Environment const env(argc, argv);
gpu::selectDeviceBasedOnMpiRank();
auto mpiManager = mpi::MPIManager::instance();
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) {
if (mpiManager->isMPIInitialized())
mpiManager->resetMPI();
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg;
logging::configureLogging(config);
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const std::string databaseFile = parameters.getParameter<std::string>("databaseFile",
"CommunicationGPU.sqlite");
auto gpuDirectComm = parameters.getParameter<bool>("gpuDirectComm", false);
const std::string layoutStr = parameters.getParameter<std::string>("layout", "fzyx");
auto runCfg = config->getOneBlock("Run");
const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");
// const uint_t warmupIterations = runCfg.getParameter<uint_t>("warmupIterations", 2);
// uint_t iterations = runCfg.getParameter<uint_t>("iterations", 10);
// const uint_t minIterations = runCfg.getParameter<uint_t>("minIterations", 2);
// const uint_t maxIterations = runCfg.getParameter<uint_t>("maxIterations", 100);
//
// const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
// const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
field::Layout layout;
if (layoutStr == "fzyx")
layout = field::fzyx;
else if (layoutStr == "zyxf")
layout = field::zyxf;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Unknown layout string " << layoutStr << ". Valid values are fzyx and zyxf.")
}
auto domainCfg = config->getOneBlock("Domain");
const Vector3<uint_t> cellsPerBlock = domainCfg.getParameter<Vector3<uint_t> >("cellsPerBlock");
uint_t const blocksPerProcess = domainCfg.getParameter<uint_t>("blocksPerProcess", 1);
auto numProcesses = mpiManager->numProcesses();
auto processes = math::getFactors3D(uint_c(numProcesses));
auto blockDecomposition = math::getFactors3D(uint_c(numProcesses) * blocksPerProcess);
auto aabb = AABB(real_t(0), real_t(0), real_t(0),
real_c(cellsPerBlock[0] * processes[0] * blocksPerProcess),
real_c(cellsPerBlock[1] * processes[1] * blocksPerProcess),
real_c(cellsPerBlock[2] * processes[2] * blocksPerProcess));
auto blocks = blockforest::createUniformBlockGrid(aabb,
blockDecomposition[0], blockDecomposition[1],
blockDecomposition[2],
cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
processes[0], processes[1], processes[2],
true, true, true, //periodicity
false // keepGlobalBlockInformation
);
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);
gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
if(packInfoName == "MemcpyPackInfo")
{
packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
}
else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
{
packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
}
else
{
WALBERLA_ABORT("packInfoName does not exist")
}
// WcTimer timer;
uint_t packSize = 0;
for (auto &block : *blocks) {
for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
{
packSize += packInfo->size(*dir, &block);
WALBERLA_LOG_INFO_ON_ROOT("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block)
<< " bytes will be communicated")
}
}
unsigned char *buffer;
WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
unsigned char *beginPtr = buffer;
unsigned char *currentPtr = buffer;
for (auto &iBlock : *blocks)
{
for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
{
auto senderBlock = dynamic_cast< Block * >( &iBlock );
packInfo->pack(*dir, currentPtr, senderBlock, nullptr);
currentPtr += packInfo->size(*dir, senderBlock);
}
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_CHECK_NOT_NULLPTR(currentPtr)
WALBERLA_CHECK_NOT_NULLPTR(beginPtr)
currentPtr = beginPtr;
for (auto &iBlock : *blocks)
{
for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
{
auto senderBlock = dynamic_cast< Block * >( &iBlock );
packInfo->unpack(*dir, currentPtr, senderBlock, nullptr);
currentPtr += packInfo->size(*dir, senderBlock);
}
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())

// The staging buffer is no longer needed; release it so repeated config runs do not leak device memory.
WALBERLA_GPU_CHECK(gpuFree(buffer))
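The commented-out WcTimer above hints at timing this pack/unpack pass; a minimal sketch of how that could look, reusing packInfo, beginPtr and currentPtr from the code above and slotting in before the gpuFree (an illustration, not part of the commit):

    // Hedged sketch: timing the raw pack calls into the preallocated device buffer.
    WcTimer timer;
    currentPtr = beginPtr;
    timer.start();
    for (auto &iBlock : *blocks)
    {
       auto senderBlock = dynamic_cast< Block * >( &iBlock );
       for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
       {
          packInfo->pack(*dir, currentPtr, senderBlock, nullptr);
          currentPtr += packInfo->size(*dir, senderBlock);
       }
    }
    WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
    timer.end();
    WALBERLA_LOG_INFO_ON_ROOT("Packing took " << timer.last() << " s")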
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// commStart();
// commWait();
//
// WcTimer warmupTimer;
// warmupTimer.start();
// for (uint_t warmupCounter = 0; warmupCounter < warmupIterations; ++warmupCounter) {
// commStart();
// commWait();
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
// }
// warmupTimer.end();
//
//
// auto estimatedTimePerIteration = warmupTimer.last() / real_c(warmupIterations);
// if (timeForBenchmark > 0) {
// iterations = uint_c(timeForBenchmark / estimatedTimePerIteration);
// if (iterations < minIterations)
// iterations = minIterations;
// if (iterations > maxIterations)
// iterations = maxIterations;
// }
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
//
// mpi::broadcastObject(iterations);
//
// auto timingPool = std::make_shared<WcTimingPool>();
// communication.enableTiming(timingPool);
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
// WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)
//
// for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
// for (uint_t ctr = 0; ctr < iterations; ++ctr) {
// timingPool->operator[]("commStart").start();
// commStart();
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
// timingPool->operator[]("commStart").end();
//
// timingPool->operator[]("commWait").start();
// commWait();
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
// timingPool->operator[]("commWait").end();
// }
// }
//
// WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
//
// auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
// WALBERLA_ROOT_SECTION() {
// WALBERLA_LOG_RESULT(*reducedTimingPool)
//
// std::map<std::string, walberla::int64_t> integerProperties;
// std::map<std::string, double> realProperties;
// std::map<std::string, std::string> stringProperties;
//
// auto databaseBlock = config->getBlock("Database");
// if (databaseBlock) {
// for (auto it = databaseBlock.begin(); it != databaseBlock.end(); ++it)
// stringProperties[it->first] = it->second;
// }
//
// realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
// realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max() / real_c(iterations));
//
// integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
// integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
// integerProperties["cellsPerBlock2"] = int64_c(cellsPerBlock[2]);
//
// integerProperties["processes0"] = int64_c(processes[0]);
// integerProperties["processes1"] = int64_c(processes[1]);
// integerProperties["processes2"] = int64_c(processes[2]);
//
// integerProperties["blocks0"] = int64_c(blockDecomposition[0]);
// integerProperties["blocks1"] = int64_c(blockDecomposition[1]);
// integerProperties["blocks2"] = int64_c(blockDecomposition[2]);
//
// integerProperties["blocksPerProcess"] = int64_c(blocksPerProcess);
// integerProperties["cartesianCommunicator"] = mpiManager->hasCartesianSetup();
//
// integerProperties["warmupIterations"] = int64_c(warmupIterations);
// integerProperties["iterations"] = int64_c(iterations);
// integerProperties["outerIterations"] = int64_c(outerIterations);
//
// stringProperties["layout"] = layoutStr;
//
// stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
// stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
// stringProperties["SLURM_CPUS_PER_TASK"] = fromEnv("SLURM_CPUS_PER_TASK");
// stringProperties["SLURM_JOB_ACCOUNT"] = fromEnv("SLURM_JOB_ACCOUNT");
// stringProperties["SLURM_JOB_ID"] = fromEnv("SLURM_JOB_ID");
// stringProperties["SLURM_JOB_CPUS_PER_NODE"] = fromEnv("SLURM_JOB_CPUS_PER_NODE");
// stringProperties["SLURM_JOB_NAME"] = fromEnv("SLURM_JOB_NAME");
// stringProperties["SLURM_JOB_NUM_NODES"] = fromEnv("SLURM_JOB_NUM_NODES");
// stringProperties["SLURM_NTASKS"] = fromEnv("SLURM_NTASKS");
// stringProperties["SLURM_NTASKS_PER_CORE"] = fromEnv("SLURM_NTASKS_PER_CORE");
// stringProperties["SLURM_NTASKS_PER_NODE"] = fromEnv("SLURM_NTASKS_PER_NODE");
// stringProperties["SLURM_NTASKS_PER_SOCKET"] = fromEnv("SLURM_NTASKS_PER_SOCKET");
// stringProperties["SLURM_TASKS_PER_NODE"] = fromEnv("SLURM_TASKS_PER_NODE");
//
// stringProperties["buildMachine"] = std::string(WALBERLA_BUILD_MACHINE);
// stringProperties["gitVersion"] = std::string(WALBERLA_GIT_SHA1);
// stringProperties["buildType"] = std::string(WALBERLA_BUILD_TYPE);
// stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
//
// auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
// sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
// sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
// }
}
return EXIT_SUCCESS;
}
\ No newline at end of file
import os
import waLBerla as wlb
DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
LOGLEVEL = 'info'
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
self.cells_per_block = cells_per_block
self.blocks_per_process = 1
self.database_file = DB_FILE
self.gpu_direct_comm = gpu_direct_comm
self.layout = layout
self.warmup_iterations = warmup_iterations
self.iterations = iterations
self.min_iterations = min_iterations
self.max_iterations = max_iterations
self.time_for_benchmark = time_for_benchmark
self.outer_iterations = outer_iterations
self.packinfo = packinfo
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'Domain': {
'cellsPerBlock': self.cells_per_block,
'blocksPerProcess': self.blocks_per_process,
},
'Parameters': {
'databaseFile': self.database_file,
'gpuDirectComm': self.gpu_direct_comm,
'layout': self.layout,
},
'Run': {
'warmupIterations': self.warmup_iterations,
'iterations': self.iterations,
'minIterations': self.min_iterations,
'maxIterations': self.max_iterations,
'timeForBenchmark': self.time_for_benchmark,
'outerIterations': self.outer_iterations,
'packinfo': self.packinfo
},
'Logging': {
'logLevel': LOGLEVEL
}
}
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
# -------------------------------------- Profiling -----------------------------------
def single_run():
"""Tests different communication overlapping strategies"""
scenarios = wlb.ScenarioManager()
cells_per_block = (128, 128, 128)
gpu_direct_comm = False
layout = "fzyx"
warmup_iterations = 10
iterations = 100
min_iterations = 10
max_iterations = 100
time_for_benchmark = 1
outer_iterations = 1
for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
gpu_direct_comm=gpu_direct_comm,
layout=layout,
warmup_iterations=warmup_iterations,
iterations=iterations,
min_iterations=min_iterations,
max_iterations=max_iterations,
time_for_benchmark=time_for_benchmark,
outer_iterations=outer_iterations,
packinfo=pack_info_name))
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
single_run()
@@ -39,9 +39,6 @@ def generate_lbm_storage_specification(generation_context, class_name: str,
     if nonuniform:
         kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels)

-    values_per_cell = len(stencil)
-    dimension = len(stencil[0])
-
     # Pure storage specification
     if not stencil_name:
         raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
...
@@ -147,7 +147,7 @@ MPIRank KnownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank, Re
    recvRequests_[ uint_c( requestIndex ) ] = MPI_REQUEST_NULL;

-   MPIRank senderRank = status.MPI_SOURCE;
+   MPIRank const senderRank = status.MPI_SOURCE;
    WALBERLA_ASSERT_GREATER_EQUAL( senderRank, 0 );
...
@@ -17,7 +17,7 @@ template<typename GPUFieldType>
 class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo
 {
  public:
-   MemcpyPackInfo( BlockDataID pdfsID_ ) : pdfsID(pdfsID_) {};
+   MemcpyPackInfo( BlockDataID fieldID_ ) : fieldID(fieldID_) {};
    ~MemcpyPackInfo() override = default;

    void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override;
@@ -26,7 +26,7 @@ public:
    uint_t size(stencil::Direction dir, IBlock * block) override;

  private:
-   BlockDataID pdfsID;
+   BlockDataID fieldID;
    uint_t numberOfGhostLayers_{0};
    bool communicateAllGhostLayers_{true};
...
@@ -19,7 +19,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
                                           IBlock * block, gpuStream_t stream)
 {
    // Extract field data pointer from the block
-   const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+   const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
    WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr )
    //
    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -68,12 +68,9 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
 template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::communicateLocal( stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream )
 {
-   // WALBERLA_ABORT("The MemcpyPackInfo does not provide a thread safe local communication. Thus is can not be used in local mode. To use it set local useLocalCommunication to false in the communication scheme")
-
    // Extract field data pointer from the block
-   const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( pdfsID );
-   const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( pdfsID );
+   const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( fieldID );
+   const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( fieldID );
    WALBERLA_ASSERT_NOT_NULLPTR( senderFieldPtr )
    WALBERLA_ASSERT_NOT_NULLPTR( receiverFieldPtr )
@@ -128,7 +125,7 @@ template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer,
                                             IBlock * block, gpuStream_t stream)
 {
-   GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+   GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
    WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr)

    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -173,100 +170,13 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char
 template<typename GPUFieldType>
 uint_t MemcpyPackInfo< GPUFieldType >::size(stencil::Direction dir, IBlock * block)
 {
-   auto pdfs = block->getData< GPUFieldType >(pdfsID);
+   auto field = block->getData< GPUFieldType >(fieldID);

    CellInterval ci;
-   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( pdfs ) );
-   pdfs->getGhostRegion(dir, ci, nrOfGhostLayers, false);
-
-   /*
-   uint_t elementsPerCell = 0;
-
-   switch( dir )
-   {
-      case stencil::SW:
-         elementsPerCell = 1;
-         break;
-      case stencil::S:
-         elementsPerCell = 5;
-         break;
-      case stencil::W:
-         elementsPerCell = 5;
-         break;
-      case stencil::B:
-         elementsPerCell = 5;
-         break;
-      case stencil::T:
-         elementsPerCell = 5;
-         break;
-      case stencil::BN:
-         elementsPerCell = 1;
-         break;
-      case stencil::N:
-         elementsPerCell = 5;
-         break;
-      case stencil::TE:
-         elementsPerCell = 1;
-         break;
-      case stencil::E:
-         elementsPerCell = 5;
-         break;
-      case stencil::BE:
-         elementsPerCell = 1;
-         break;
-      case stencil::SE:
-         elementsPerCell = 1;
-         break;
-      case stencil::C:
-         elementsPerCell = 1;
-         break;
-      case stencil::TN:
-         elementsPerCell = 1;
-         break;
-      case stencil::TS:
-         elementsPerCell = 1;
-         break;
-      case stencil::NE:
-         elementsPerCell = 1;
-         break;
-      case stencil::BW:
-         elementsPerCell = 1;
-         break;
-      case stencil::NW:
-         elementsPerCell = 1;
-         break;
-      case stencil::BS:
-         elementsPerCell = 1;
-         break;
-      case stencil::TW:
-         elementsPerCell = 1;
-         break;
-      default:
-         elementsPerCell = 0;
-   }
-   return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type);
-   */
-   uint_t totalCells = ci.numCells() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type);
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( field ) );
+   field->getGhostRegion(dir, ci, nrOfGhostLayers, false);
+
+   uint_t totalCells = ci.numCells() * field->fSize() * sizeof(typename GPUFieldType::value_type);
    return totalCells;
 }
...
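One consequence of the simplification above deserves a note: the deleted switch packed only the direction-relevant values per ghost cell (5 across a face, 1 along an edge, matching a D3Q19 layout), while the new formula always packs all fSize values. Sketching the arithmetic for one face ghost cell of a double-precision D3Q19 field:

    // Old, direction-aware size per face ghost cell:  5 * sizeof(double) =  40 bytes
    // New, full-field size per face ghost cell:      19 * sizeof(double) = 152 bytes
    // The new size() is a simple upper bound: it trades some bandwidth for supporting
    // arbitrary GPU fields, in line with the pdfsID -> fieldID rename in this commit.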
@@ -18,7 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
 #pragma once

 #include "blockforest/StructuredBlockForest.h"
@@ -30,48 +29,52 @@
 #include "stencil/Directions.h"

-#include <thread>
-
-#include "gpu/GPURAII.h"
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedGPUPackInfo.h"

+#include <thread>
+
 namespace walberla {
 namespace gpu
 {
 namespace communication {

 template<typename Stencil>
 class UniformGPUScheme
 {
  public:
-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                               const int tag = 5432 );

-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               const Set<SUID> & requiredBlockSelectors,
                               const Set<SUID> & incompatibleBlockSelectors,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                               const int tag = 5432 );

+   ~UniformGPUScheme();
+
    void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );

-   void startCommunication( gpuStream_t stream = nullptr);
-   void wait( gpuStream_t stream = nullptr);
+   void startCommunication();
+   void wait();

    void operator()( gpuStream_t stream = nullptr )         { communicate( stream ); }
    inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }

-   std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr );
+   std::function<void()> getCommunicateFunctor();
+   std::function<void()> getStartCommunicateFunctor();
+   std::function<void()> getWaitFunctor();

+   void enableTiming( const shared_ptr<WcTimingPool> & timingPool )
+   {
+      timing_ = true;
+      timingPool_ = timingPool;
+   }
+
  private:
    void setupCommunication();
@@ -92,8 +95,6 @@ namespace communication {
    std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;

-   ParallelStreams parallelSectionManager_;
-
    struct Header
    {
       BlockID blockId;
@@ -103,6 +104,11 @@ namespace communication {
    Set<SUID> requiredBlockSelectors_;
    Set<SUID> incompatibleBlockSelectors_;

+   bool timing_{false};
+   shared_ptr<WcTimingPool> timingPool_;
+
+   gpuStream_t streams_[Stencil::Q];
+
 };
...
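The scheme now owns one GPU stream per stencil direction (streams_[Stencil::Q]) instead of the removed ParallelStreams manager, which is why the stream parameters disappear from the public API; a sketch of the resulting call pattern, assuming the signatures above:

    // Sketch: pack/unpack for different directions can still overlap, but on scheme-owned streams.
    UniformGPUScheme< Stencil_T > scheme(blocks, /*sendDirectlyFromGPU=*/false);
    scheme.addPackInfo(packInfo);
    scheme.startCommunication();   // packs on streams_[*dir], then posts the sends
    scheme.wait();                 // receives and unpacks on streams_[inverseDir]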
@@ -18,9 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
-#include "gpu/ParallelStreams.h"
-
 namespace walberla {
 namespace gpu
 {
@@ -28,7 +25,7 @@ namespace communication {

    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                 bool sendDirectlyFromGPU,
                                                 bool useLocalCommunication,
                                                 const int tag )
@@ -39,7 +36,6 @@ namespace communication {
         useLocalCommunication_(useLocalCommunication),
         bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
         bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-        parallelSectionManager_( -1 ),
         requiredBlockSelectors_( Set<SUID>::emptySet() ),
         incompatibleBlockSelectors_( Set<SUID>::emptySet() )
    {
@@ -49,10 +45,15 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+      }
    }

    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                 const Set<SUID> & requiredBlockSelectors,
                                                 const Set<SUID> & incompatibleBlockSelectors,
                                                 bool sendDirectlyFromGPU,
@@ -65,7 +66,6 @@ namespace communication {
         useLocalCommunication_(useLocalCommunication),
         bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
         bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-        parallelSectionManager_( -1 ),
         requiredBlockSelectors_( requiredBlockSelectors ),
         incompatibleBlockSelectors_( incompatibleBlockSelectors )
    {
@@ -75,11 +75,25 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+      }
    }

+   template< typename Stencil >
+   UniformGPUScheme< Stencil >::~UniformGPUScheme()
+   {
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+      }
+   }
+
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::startCommunication()
    {
       WALBERLA_ASSERT( !communicationInProgress_ )
       auto forest = blockForest_.lock();
@@ -99,9 +113,11 @@ namespace communication {
       for( auto it : headers_ )
          bufferSystemGPU_.sendBuffer( it.first ).clear();

+      if(timing_)
+         timingPool_->operator[]("UniformGPUScheme->startCommunication").start();
+
       // Start filling send buffers
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto &iBlock : *forest )
          {
            auto senderBlock = dynamic_cast< Block * >( &iBlock );
@@ -124,7 +140,7 @@ namespace communication {
               auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
               for (auto& pi : packInfos_)
               {
-                  pi->communicateLocal(*dir, senderBlock, receiverBlock, stream);
+                  pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]);
               }
            }
            else
@@ -133,26 +149,29 @@ namespace communication {

               for( auto &pi : packInfos_ )
               {
-                  parallelSection.run([&](auto s) {
                   auto size = pi->size( *dir, senderBlock );
                   auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                  pi->pack( *dir, gpuDataPtr, senderBlock, s );
+                  pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] );

                   if( !sendFromGPU_ )
                   {
                      auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                      WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
-                     WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s ))
+                     WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] ))
                   }
-                  });
               }
            }
          }
       }

       // wait for packing to finish
-      WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) );
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
+
+      if(timing_)
+         timingPool_->operator[]("UniformGPUScheme->startCommunication").end();

       if( sendFromGPU_ )
          bufferSystemGPU_.sendAll();
@@ -164,15 +183,17 @@ namespace communication {

    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::wait( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::wait()
    {
       WALBERLA_ASSERT( communicationInProgress_ )
       auto forest = blockForest_.lock();

+      if(timing_)
+         timingPool_->operator[]("UniformGPUScheme->wait").start();
+
       if( sendFromGPU_ )
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
         for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
         {
            recvInfo.buffer().clear();
@@ -185,16 +206,13 @@ namespace communication {
               auto size = pi->size( header.dir, block );
               auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
               WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-               parallelSection.run([&](auto s) {
-                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-               });
+               pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
            }
         }
       }
       else
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
         for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
         {
            auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
@@ -211,17 +229,20 @@ namespace communication {
               auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
               WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
               WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-               parallelSection.run([&](auto s) {
-                  WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                      gpuMemcpyHostToDevice, s ))
-                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-               });
+               WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
+                                                   gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]] ))
+               pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
            }
         }
       }

-      WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
+
+      if(timing_)
+         timingPool_->operator[]("UniformGPUScheme->wait").end();

       communicationInProgress_ = false;
    }
@@ -309,21 +330,21 @@ namespace communication {
    }

    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor()
    {
-      return [this, stream]() { communicate( stream ); };
+      return [this]() { communicate(); };
    }

    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor()
    {
-      return [this, stream]() { startCommunication( stream ); };
+      return [this]() { startCommunication(); };
    }

    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(cudaStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor()
    {
-      return [this, stream]() { wait( stream ); };
+      return [this]() { wait(); };
    }

 } // namespace communication
...