From 6ce8f928a2571473a4820a76ce29c561c92ad22e Mon Sep 17 00:00:00 2001 From: Richard Angersbach <richard.angersbach@fau.de> Date: Fri, 30 Jun 2023 15:09:59 +0200 Subject: [PATCH] Missing synchronization for time measurements with CUDA --- .../FlowAroundSphereCodeGen/CMakeLists.txt | 8 +- .../PhaseFieldAllenCahn/CMakeLists.txt | 4 +- apps/tutorials/codegen/CMakeLists.txt | 5 +- src/CMakeLists.txt | 4 +- src/core/mpi/Datatype.h | 12 +- src/core/mpi/MPIWrapper.h | 15 +- src/core/timing/CMakeLists.txt | 1 + src/core/timing/DeviceSynchronizePolicy.h | 84 +++++ src/core/timing/Timer.h | 6 +- src/core/timing/TimingNode.cpp | 1 + src/core/timing/TimingNode.h | 1 + src/core/timing/TimingPool.cpp | 1 + src/core/timing/TimingPool.h | 1 + src/core/timing/TimingTree.cpp | 1 + src/core/timing/TimingTree.h | 1 + src/gpu/AlignedAllocation.cpp | 15 +- src/gpu/CMakeLists.txt | 1 + src/gpu/DeviceSelectMPI.cpp | 59 ++-- src/gpu/DeviceSelectMPI.h | 1 + src/gpu/DeviceWrapper.h | 292 ++++++++++++++++++ src/gpu/ErrorChecking.h | 6 +- src/gpu/FieldAccessor.h | 2 +- src/gpu/FieldCopy.h | 176 +++++------ src/gpu/FieldIndexing.h | 1 + src/gpu/FieldIndexing.impl.h | 24 +- src/gpu/FieldIndexing3D.impl.h | 24 +- src/gpu/FieldIndexingXYZ.h | 3 + src/gpu/FieldIndexingXYZ.impl.h | 24 +- src/gpu/GPUCopy.h | 3 +- src/gpu/GPUField.h | 6 +- src/gpu/GPUField.impl.h | 35 ++- src/gpu/HostFieldAllocator.h | 30 +- src/gpu/Kernel.h | 5 +- src/gpu/NVTX.h | 69 +++-- src/gpu/ParallelStreams.cpp | 59 ++-- src/gpu/communication/UniformGPUScheme.impl.h | 2 +- .../gpu/BasicRecursiveTimeStepGPU.h | 2 +- .../gpu/BasicRecursiveTimeStepGPU.impl.h | 2 +- .../refinement/BasicRecursiveTimeStep.h | 2 +- .../refinement/BasicRecursiveTimeStep.impl.h | 2 +- src/timeloop/CMakeLists.txt | 2 - src/timeloop/SelectableFunctionCreators.h | 2 +- src/timeloop/SweepTimeloop.h | 14 +- ...SweepTimeloop.cpp => SweepTimeloop.impl.h} | 15 +- src/timeloop/Timeloop.h | 22 +- .../{Timeloop.cpp => Timeloop.impl.h} | 58 ++-- tests/pe/Refinement.cpp | 2 +- 47 files changed, 786 insertions(+), 319 deletions(-) create mode 100644 src/core/timing/DeviceSynchronizePolicy.h create mode 100644 src/gpu/DeviceWrapper.h rename src/timeloop/{SweepTimeloop.cpp => SweepTimeloop.impl.h} (90%) rename src/timeloop/{Timeloop.cpp => Timeloop.impl.h} (83%) diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt index 40a17bda2..17cfd93fd 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt +++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt @@ -11,10 +11,10 @@ waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated FlowAroundSphereCodeGen_PackInfoOdd.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoOdd.h FlowAroundSphereCodeGen_InfoHeader.h) -if (WALBERLA_BUILD_WITH_CUDA) - waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp - DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) +if (WALBERLA_BUILD_WITH_GPU_SUPPORT ) + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) else () waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) -endif (WALBERLA_BUILD_WITH_CUDA) +endif (WALBERLA_BUILD_WITH_GPU_SUPPORT ) \ No newline at end of file diff --git a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt index 52d29a0fb..1b530d61a 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt +++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt @@ -12,7 +12,7 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h GenDefines.h) -if (WALBERLA_BUILD_WITH_CUDA) +if (WALBERLA_BUILD_WITH_GPU_SUPPORT ) waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) @@ -20,5 +20,5 @@ else () waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) -endif (WALBERLA_BUILD_WITH_CUDA) +endif (WALBERLA_BUILD_WITH_GPU_SUPPORT ) diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt index 2a56a5b67..4b50efaa3 100644 --- a/apps/tutorials/codegen/CMakeLists.txt +++ b/apps/tutorials/codegen/CMakeLists.txt @@ -24,7 +24,6 @@ if( WALBERLA_BUILD_WITH_CODEGEN ) DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython ) # Tutorial 3: Advanced lbmpy Code Generation - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython FILE 03_AdvancedLBMCodegen.py OUT_FILES CumulantMRTSweep.${CODEGEN_FILE_SUFFIX} CumulantMRTSweep.h @@ -34,8 +33,8 @@ if( WALBERLA_BUILD_WITH_CODEGEN ) if(WALBERLA_BUILD_WITH_GPU_SUPPORT) walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp - FILES 03_AdvancedLBMCodegen.cpp - DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) + FILES 03_AdvancedLBMCodegen.cpp + DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) else() walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp FILES 03_AdvancedLBMCodegen.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 92b465e32..d49a1e63b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,9 +24,7 @@ add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( communication ) add_subdirectory( core ) -if ( WALBERLA_BUILD_WITH_GPU_SUPPORT ) - add_subdirectory(gpu) -endif() +add_subdirectory(gpu) add_subdirectory( domain_decomposition ) add_subdirectory( executiontree ) if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND ) diff --git a/src/core/mpi/Datatype.h b/src/core/mpi/Datatype.h index 80b7931ef..f717cb6d9 100644 --- a/src/core/mpi/Datatype.h +++ b/src/core/mpi/Datatype.h @@ -40,23 +40,17 @@ namespace mpi { Datatype( MPI_Datatype datatype) : mpiDatatype_( datatype ) { -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_commit( &mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); } } void init( MPI_Datatype datatype ) { mpiDatatype_ = datatype; -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_commit( &mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); } } ~Datatype() { -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_free( & mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_free( & mpiDatatype_ ); } } operator MPI_Datatype() const { diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h index cd250cb97..6b406c631 100644 --- a/src/core/mpi/MPIWrapper.h +++ b/src/core/mpi/MPIWrapper.h @@ -120,6 +120,8 @@ struct MPI_Status const int MPI_COMM_NULL = 0; const int MPI_COMM_WORLD = 1; +const int MPI_COMM_TYPE_SHARED = 0; + const int MPI_SUCCESS = 1; @@ -202,11 +204,14 @@ inline int MPI_Comm_size( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Comm_rank( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Comm_get_name( MPI_Comm, char*, int* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_group ( MPI_Comm, MPI_Group* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_free ( MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_dup ( MPI_Comm, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Info_create ( MPI_Info * ) { WALBERLA_MPI_FUNCTION_ERROR } + +inline int MPI_Comm_group ( MPI_Comm, MPI_Group* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_free ( MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_dup ( MPI_Comm, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_split_type ( MPI_Comm, int, int, MPI_Info, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Cart_create( MPI_Comm, int, int*, int*, int, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } diff --git a/src/core/timing/CMakeLists.txt b/src/core/timing/CMakeLists.txt index b949b2eeb..1de08d962 100644 --- a/src/core/timing/CMakeLists.txt +++ b/src/core/timing/CMakeLists.txt @@ -16,4 +16,5 @@ target_sources( core TimingTree.cpp TimingTree.h WcPolicy.h + DeviceSynchronizePolicy.h ) diff --git a/src/core/timing/DeviceSynchronizePolicy.h b/src/core/timing/DeviceSynchronizePolicy.h new file mode 100644 index 000000000..7c494e48d --- /dev/null +++ b/src/core/timing/DeviceSynchronizePolicy.h @@ -0,0 +1,84 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file DeviceSynchronizePolicy.h +//! \ingroup core +//! \author Richard Angersbach +//! \brief Gpu Timing Policy +// +//====================================================================================================================== + +#pragma once + +#include "gpu/DeviceWrapper.h" + +#include "Time.h" + +namespace walberla +{ +namespace timing +{ + +//====================================================================================================================== +// +// CLASS DEFINITION +// +//====================================================================================================================== + +//********************************************************************************************************************** +/*!\brief Timing policy for the measurement of the GPU time. +// \ingroup timing +// +// The DeviceSynchronizePolicy class represents the timing policy for GPU time measurements that can be used +// in combination with the Timer class template. This combination is realized with the DeviceSynchronizePolicy +// type definition. +// This class uses device synchronization internally and is therefore not suited for CUDA +// applications with overlapping kernels. +*/ +struct DeviceSynchronizePolicy +{ + public: + //**Timing functions**************************************************************************** + /*!\name Timing functions */ + //@{ + static inline double getTimestamp(); + //@} + //******************************************************************************************************************* +}; +//********************************************************************************************************************** + +//====================================================================================================================== +// +// TIMING FUNCTIONS +// +//====================================================================================================================== + +//********************************************************************************************************************** +/*!\brief Returns a timestamp of the current GPU time in seconds. Uses wall clock time and device synchronization +internally. +// +// \return GPU timestamp in seconds. +*/ +inline double DeviceSynchronizePolicy::getTimestamp() +{ + // synchronize device before getting timestamp + WALBERLA_DEVICE_SECTION() { gpuDeviceSynchronize(); } + + return getWcTime(); +} +//********************************************************************************************************************** + +} // namespace timing +} // namespace walberla diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h index 32c1e7f30..9f7c3f97d 100644 --- a/src/core/timing/Timer.h +++ b/src/core/timing/Timer.h @@ -25,14 +25,17 @@ #pragma once #include "CpuPolicy.h" +#include "DeviceSynchronizePolicy.h" #include "ReduceType.h" #include "WcPolicy.h" -#include "core/DataTypes.h" +#include "core/DataTypes.h" #include "core/mpi/RecvBuffer.h" #include "core/mpi/Reduce.h" #include "core/mpi/SendBuffer.h" +#include "gpu/DeviceWrapper.h" + #include <iomanip> #include <iostream> #include <limits> @@ -590,6 +593,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, Timer<TP } //namespace timing using CpuTimer = timing::Timer<timing::CpuPolicy>; +using DeviceSynchronizeTimer = timing::Timer<timing::DeviceSynchronizePolicy>; using WcTimer = timing::Timer<timing::WcPolicy>; } // namespace walberla diff --git a/src/core/timing/TimingNode.cpp b/src/core/timing/TimingNode.cpp index 3e0cf4df5..c75cd1414 100644 --- a/src/core/timing/TimingNode.cpp +++ b/src/core/timing/TimingNode.cpp @@ -29,6 +29,7 @@ namespace timing { // Explicit instantiation template struct TimingNode<WcPolicy>; +template struct TimingNode<DeviceSynchronizePolicy>; template struct TimingNode<CpuPolicy>; } // namespace timing diff --git a/src/core/timing/TimingNode.h b/src/core/timing/TimingNode.h index 5b9c29aa2..0b6326e71 100644 --- a/src/core/timing/TimingNode.h +++ b/src/core/timing/TimingNode.h @@ -494,6 +494,7 @@ void addRemainderNodes(timing::TimingNode<TP> &tn) { } using WcTimingNode = timing::TimingNode<timing::WcPolicy>; +using DeviceSynchronizeTimingNode = timing::TimingNode<timing::DeviceSynchronizePolicy>; using CpuTimingNode = timing::TimingNode<timing::CpuPolicy>; } diff --git a/src/core/timing/TimingPool.cpp b/src/core/timing/TimingPool.cpp index 7539fffe3..28cf668f2 100644 --- a/src/core/timing/TimingPool.cpp +++ b/src/core/timing/TimingPool.cpp @@ -474,6 +474,7 @@ void TimingPool<TP>::clear () // Explicit instantiation template class TimingPool<WcPolicy>; +template class TimingPool<DeviceSynchronizePolicy>; template class TimingPool<CpuPolicy>; diff --git a/src/core/timing/TimingPool.h b/src/core/timing/TimingPool.h index 5e41c14d7..2d5ed0996 100644 --- a/src/core/timing/TimingPool.h +++ b/src/core/timing/TimingPool.h @@ -249,5 +249,6 @@ namespace timing { namespace walberla { using WcTimingPool = timing::TimingPool<timing::WcPolicy>; + using DeviceSynchronizeTimingPool = timing::TimingPool<timing::DeviceSynchronizePolicy>; using CpuTimingPool = timing::TimingPool<timing::CpuPolicy>; } diff --git a/src/core/timing/TimingTree.cpp b/src/core/timing/TimingTree.cpp index fc891c31a..14cd47232 100644 --- a/src/core/timing/TimingTree.cpp +++ b/src/core/timing/TimingTree.cpp @@ -29,6 +29,7 @@ namespace timing { // Explicit instantiation template class TimingTree<WcPolicy>; +template class TimingTree<DeviceSynchronizePolicy>; template class TimingTree<CpuPolicy>; } // namespace timing diff --git a/src/core/timing/TimingTree.h b/src/core/timing/TimingTree.h index 63d85d243..5cf06167e 100644 --- a/src/core/timing/TimingTree.h +++ b/src/core/timing/TimingTree.h @@ -259,5 +259,6 @@ TimingTree< TP > TimingTree< TP >::getCopyWithRemainder() const } using WcTimingTree = timing::TimingTree<timing::WcPolicy>; +using DeviceSynchronizeTimingTree = timing::TimingTree<timing::DeviceSynchronizePolicy>; using CpuTimingTree = timing::TimingTree<timing::CpuPolicy>; } diff --git a/src/gpu/AlignedAllocation.cpp b/src/gpu/AlignedAllocation.cpp index 2a2bee41c..65e58c79a 100644 --- a/src/gpu/AlignedAllocation.cpp +++ b/src/gpu/AlignedAllocation.cpp @@ -20,6 +20,7 @@ //====================================================================================================================== #include "AlignedAllocation.h" +#include "gpu/DeviceWrapper.h" #include "gpu/ErrorChecking.h" #include "core/debug/CheckFunctions.h" #include "core/debug/Debug.h" @@ -34,6 +35,11 @@ namespace gpu void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + // With 0 alignment this function makes no sense // use normal malloc instead WALBERLA_ASSERT_GREATER( alignment, 0 ) @@ -50,8 +56,8 @@ namespace gpu return result; } - void *pa; // pointer to allocated memory - void *ptr; // pointer to usable aligned memory + void *pa = nullptr; // pointer to allocated memory + void *ptr = nullptr; // pointer to usable aligned memory WALBERLA_GPU_CHECK( gpuMalloc( &pa, size + alignment )); WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "GPU malloc did not return memory with requested alignment"); @@ -65,6 +71,11 @@ namespace gpu void free_aligned_with_offset( void *ptr ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + // assume that pointer to real allocated chunk is stored just before // chunk that was given to user WALBERLA_GPU_CHECK( gpuFree( freePointers_[ptr] )); diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt index a8e58ab49..fb6810d4e 100644 --- a/src/gpu/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -20,6 +20,7 @@ target_sources( gpu AddGPUFieldToStorage.impl.h GPUField.h GPUWrapper.h + DeviceWrapper.h FieldAccessor3D.h DeviceSelectMPI.h HostFieldAllocator.h diff --git a/src/gpu/DeviceSelectMPI.cpp b/src/gpu/DeviceSelectMPI.cpp index 52454653b..81b87b3de 100644 --- a/src/gpu/DeviceSelectMPI.cpp +++ b/src/gpu/DeviceSelectMPI.cpp @@ -38,38 +38,41 @@ void selectDeviceBasedOnMpiRank() { void selectDeviceBasedOnMpiRank() { -#ifdef WALBERLA_BUILD_WITH_MPI - int deviceCount; - WALBERLA_GPU_CHECK( gpuGetDeviceCount( &deviceCount )) - WALBERLA_LOG_INFO_ON_ROOT( "Selecting device depending on MPI Rank" ) - MPI_Info info; - MPI_Info_create( &info ); - MPI_Comm newCommunicator; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator ); + WALBERLA_DEVICE_SECTION() + { + WALBERLA_MPI_SECTION() + { + int deviceCount; + WALBERLA_GPU_CHECK(gpuGetDeviceCount(&deviceCount)) + WALBERLA_LOG_INFO_ON_ROOT("Selecting device depending on MPI Rank") - int processesOnNode; - int rankOnNode; - MPI_Comm_size( newCommunicator, &processesOnNode ); - MPI_Comm_rank( newCommunicator, &rankOnNode ); + MPI_Info info; + MPI_Info_create(&info); + MPI_Comm newCommunicator; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator); - if ( deviceCount == processesOnNode ) - { - WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode )) - } - else if ( deviceCount > processesOnNode ) - { - WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: " - << processesOnNode << ", available GPUs on node: " << deviceCount ) - WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode )) - } - else - { - WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node " - << processesOnNode << ", available GPUs on node " << deviceCount ) - WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode % deviceCount )) + int processesOnNode; + int rankOnNode; + MPI_Comm_size(newCommunicator, &processesOnNode); + MPI_Comm_rank(newCommunicator, &rankOnNode); + + if (deviceCount == processesOnNode) { WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode)) } + else if (deviceCount > processesOnNode) + { + WALBERLA_LOG_WARNING("Not using all available GPUs on node. Processes on node: " + << processesOnNode << ", available GPUs on node: " << deviceCount) + WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode)) + } + else + { + WALBERLA_LOG_WARNING( + "Too many processes started per node - should be one per GPU. Number of processes per node " + << processesOnNode << ", available GPUs on node " << deviceCount) + WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode % deviceCount)) + } + } } -#endif } #endif diff --git a/src/gpu/DeviceSelectMPI.h b/src/gpu/DeviceSelectMPI.h index 34d763f93..5ed18edf5 100644 --- a/src/gpu/DeviceSelectMPI.h +++ b/src/gpu/DeviceSelectMPI.h @@ -21,6 +21,7 @@ #pragma once +#include "gpu/DeviceWrapper.h" namespace walberla { namespace gpu diff --git a/src/gpu/DeviceWrapper.h b/src/gpu/DeviceWrapper.h new file mode 100644 index 000000000..64590bd4a --- /dev/null +++ b/src/gpu/DeviceWrapper.h @@ -0,0 +1,292 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file DeviceWrapper.h +//! \ingroup gpu +//! \author Richard Angersbach <richard.angersbach@fau.de> +// +//====================================================================================================================== + +#pragma once + +/// \cond internal + +#include <sstream> +#include "core/Abort.h" + +// CMake generated header +#include "waLBerlaDefinitions.h" + +// DEVICE SECTION // + +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + +#define WALBERLA_DEVICE_SECTION() if (true) +#define WALBERLA_NON_DEVICE_SECTION() if (false) + +#else + +#define WALBERLA_DEVICE_SECTION() if (false) +#define WALBERLA_NON_DEVICE_SECTION() if (true) + +#endif + +namespace walberla { +namespace gpustubs { + // empty namespace which can be used +} // namespace gpustubs +} // namespace walberla + +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + +// include runtime header +#include "gpu/GPUWrapper.h" + +#else // WALBERLA_BUILD_WITH_GPU_SUPPORT + +namespace walberla { +namespace gpustubs { + +// dummy definitions for CUDA/HIP data types and functions in order to guarantee successful compilation without CUDA/HIP enabled + +#define WALBERLA_DEVICE_FUNCTION_ERROR \ + WALBERLA_ABORT("Invalid device function call! In case of compiling without CUDA/HIP, functions are not " \ + "available and shouldn't be called!"); + +#ifndef __CUDACC__ + #define __device__ + #define __global__ + #define __host__ + #define __forceinline__ +#endif + +using gpuError_t = int; +const gpuError_t gpuSuccess = 0; + +#define gpuHostAllocDefault 0x00 +#define gpuHostAllocMapped 0x02 +#define gpuHostAllocPortable 0x01 +#define gpuHostAllocWriteCombined 0x04 + +using gpuMemcpyKind = int; +const gpuMemcpyKind gpuMemcpyHostToHost = 0; +const gpuMemcpyKind gpuMemcpyHostToDevice = 1; +const gpuMemcpyKind gpuMemcpyDeviceToHost = 2; +const gpuMemcpyKind gpuMemcpyDeviceToDevice = 3; +const gpuMemcpyKind gpuMemcpyDefault = 4; + +inline const char* gpuGetErrorName(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline const char* gpuGetErrorString(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuGetLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuPeekAtLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuMalloc(void** /*devPtr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMallocHost(void** /*ptr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuHostAlloc(void** /*pHost*/, size_t /*size*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct gpuPos +{ + size_t x, y, z; +}; + +struct gpuPitchedPtr +{ + size_t pitch; + void* ptr; + size_t xsize; + size_t ysize; +}; + +struct gpuExtent +{ + size_t depth; + size_t height; + size_t width; +}; + +struct gpuArray; +typedef struct gpuArray* gpuArray_t; +typedef struct gpuArray* gpuArray_const_t; + +struct CUstream_st; +typedef struct CUstream_st* gpuStream_t; +inline gpuError_t gpuStreamDestroy(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamCreateWithPriority(gpuStream_t* /*pStream*/, unsigned int /*flags*/, int /*priority*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamCreateWithFlags(gpuStream_t* /*pStream*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuDeviceGetStreamPriorityRange(int* /*leastPriority*/, int* /*greatestPriority*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamCreate(gpuStream_t* /*pStream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamSynchronize(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct gpuMemcpy3DParms +{ + gpuArray_t dstArray; + gpuPos dstPos; + gpuPitchedPtr dstPtr; + gpuExtent extent; + gpuMemcpyKind kind; + gpuArray_t srcArray; + gpuPos srcPos; + gpuPitchedPtr srcPtr; +}; + +inline gpuError_t gpuMemcpy(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpyAsync(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpy3D(const gpuMemcpy3DParms* /*p*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpy3DAsync(const gpuMemcpy3DParms* /*p*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuPos make_gpuPos(size_t /*x*/, size_t /*y*/, size_t /*z*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuPitchedPtr make_gpuPitchedPtr (void* /*d*/, size_t /*p*/, size_t /*xsz*/, size_t /*ysz*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuExtent make_gpuExtent(size_t /*w*/, size_t /*h*/, size_t /*d*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuFree(void* /*devPtr*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuFreeHost(void* /*ptr*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuDeviceSynchronize(void) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct CUevent_st; +typedef struct CUevent_st* gpuEvent_t; +inline gpuError_t gpuEventCreate(gpuEvent_t* /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuEventCreateWithFlags(gpuEvent_t* /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuEventRecord(gpuEvent_t /*event*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuEventDestroy(gpuEvent_t /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamWaitEvent (gpuStream_t /*stream*/, gpuEvent_t /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#define gpuStreamDefault 0x00 + +inline gpuError_t gpuGetDeviceCount(int* /*count*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuSetDevice(int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct CUuuid_st +{ + char bytes; +}; +typedef struct CUuuid_st gpuUUID_t; + +struct gpuDeviceProp +{ + char name[256]; + gpuUUID_t uuid; + size_t totalGlobalMem; + size_t sharedMemPerBlock; + int regsPerBlock; + int warpSize; + size_t memPitch; + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int clockRate; + size_t totalConstMem; + int major; + int minor; + size_t textureAlignment; + size_t texturePitchAlignment; + int deviceOverlap; + int multiProcessorCount; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int computeMode; + int maxTexture1D; + int maxTexture1DMipmap; + int maxTexture1DLinear; + int maxTexture2D[2]; + int maxTexture2DMipmap[2]; + int maxTexture2DLinear[3]; + int maxTexture2DGather[2]; + int maxTexture3D[3]; + int maxTexture3DAlt[3]; + int maxTextureCubemap; + int maxTexture1DLayered[2]; + int maxTexture2DLayered[3]; + int maxTextureCubemapLayered[2]; + int maxSurface1D; + int maxSurface2D[2]; + int maxSurface3D[3]; + int maxSurface1DLayered[2]; + int maxSurface2DLayered[3]; + int maxSurfaceCubemap; + int maxSurfaceCubemapLayered[2]; + size_t surfaceAlignment; + int concurrentKernels; + int ECCEnabled; + int pciBusID; + int pciDeviceID; + int pciDomainID; + int tccDriver; + int asyncEngineCount; + int unifiedAddressing; + int memoryClockRate; + int memoryBusWidth; + int l2CacheSize; + int persistingL2CacheMaxSize; + int maxThreadsPerMultiProcessor; + int streamPrioritiesSupported; + int globalL1CacheSupported; + int localL1CacheSupported; + size_t sharedMemPerMultiprocessor; + int regsPerMultiprocessor; + int managedMemory; + int isMultiGpuBoard; + int multiGpuBoardGroupID; + int singleToDoublePrecisionPerfRatio; + int pageableMemoryAccess; + int concurrentManagedAccess; + int computePreemptionSupported; + int canUseHostPointerForRegisteredMem; + int cooperativeLaunch; + int cooperativeMultiDeviceLaunch; + int pageableMemoryAccessUsesHostPageTables; + int directManagedMemAccessFromHost; + int accessPolicyMaxWindowSize; +}; +inline gpuError_t gpuGetDeviceProperties(gpuDeviceProp* /*prop*/, int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct uint3 +{ + unsigned int x, y, z; +}; +typedef struct uint3 uint3; + +struct dim3 +{ + unsigned int x, y, z; + dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } +}; +typedef struct dim3 dim3; + +inline gpuError_t gpuLaunchKernel(const void* /*func*/, dim3 /*gridDim*/, dim3 /*blockDim*/, void** /*args*/, size_t /*sharedMem*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#ifdef _WIN32 +#define GPURT_CB __stdcall +#else +#define GPURT_CB +#endif + +typedef void(GPURT_CB* gpuHostFn_t)(void* /*userData*/); +inline gpuError_t gpuLaunchHostFunc(gpuStream_t /*stream*/, gpuHostFn_t /*fn*/, void* /*userData*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#undef WALBERLA_DEVICE_FUNCTION_ERROR + +} // namespace gpustubs +using namespace gpustubs; + +} // namespace walberla + + +#endif // WALBERLA_BUILD_WITH_GPU_SUPPORT + +/// \endcond diff --git a/src/gpu/ErrorChecking.h b/src/gpu/ErrorChecking.h index a80ef0334..7031a936f 100644 --- a/src/gpu/ErrorChecking.h +++ b/src/gpu/ErrorChecking.h @@ -25,14 +25,14 @@ #include <sstream> -#include "gpu/GPUWrapper.h" +#include "gpu/DeviceWrapper.h" namespace walberla { namespace gpu { #define WALBERLA_GPU_CHECK(ans) { ::walberla::gpu::checkForError((ans), __FILE__, __LINE__); } -#define WALBERLA_GPU_CHECK_LAST_ERROR() {::walberla::gpu::checkForLastError(__FILE__, __LINE__);} +#define WALBERLA_GPU_CHECK_LAST_ERROR() { ::walberla::gpu::checkForLastError(__FILE__, __LINE__); } @@ -56,7 +56,7 @@ inline void checkForLastError( const std::string & callerPath, const int line ) if(code != gpuSuccess) { std::stringstream ss; - ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code ); + ss << "CUDA Error: " << code << " " << gpuGetErrorName(code) << ": " << gpuGetErrorString( code ); Abort::instance()->abort( ss.str(), callerPath, line ); } } diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h index 798440d12..cd50cc58d 100644 --- a/src/gpu/FieldAccessor.h +++ b/src/gpu/FieldAccessor.h @@ -22,8 +22,8 @@ #pragma once #include "core/DataTypes.h" - #include "gpu/GPUWrapper.h" +#include "gpu/DeviceWrapper.h" namespace walberla { namespace gpu diff --git a/src/gpu/FieldCopy.h b/src/gpu/FieldCopy.h index 13c079074..6895661ec 100644 --- a/src/gpu/FieldCopy.h +++ b/src/gpu/FieldCopy.h @@ -98,56 +98,48 @@ namespace gpu template<typename T, uint_t fs> void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src ) { - gpuMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - - if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ) - } - - bool canCopy = ( src.layout() == fzyx && - dst.fAllocSize() == src.fAllocSize() && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xSize() == src.xSize() ) - || - ( src.layout() == zyxf && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() && - dst.fSize() == src.fSize() ); - - if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size ") - } - - if ( dst.layout() == fzyx ) - { - p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()), // pointer - sizeof(T) * src.xAllocSize(), // pitch - src.xAllocSize(), // inner dimension size - src.yAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); - p.extent.height = dst.yAllocSize(); - p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); - } - else + WALBERLA_DEVICE_SECTION() { - p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()), // pointer - sizeof(T) * src.fAllocSize(), // pitch - src.fAllocSize(), // inner dimension size - src.xAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); - p.extent.height = dst.xAllocSize(); - p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + gpuMemcpy3DParms p; + memset(&p, 0, sizeof(p)); + + if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") } + + bool canCopy = + (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) || + (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize()); + + if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") } + + if (dst.layout() == fzyx) + { + p.srcPtr = make_gpuPitchedPtr((void*) (src.data()), // pointer + sizeof(T) * src.xAllocSize(), // pitch + src.xAllocSize(), // inner dimension size + src.yAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + } + else + { + p.srcPtr = make_gpuPitchedPtr((void*) (src.data()), // pointer + sizeof(T) * src.fAllocSize(), // pitch + src.fAllocSize(), // inner dimension size + src.xAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + } + + p.dstPtr = dst.pitchedPtr(); + p.kind = gpuMemcpyHostToDevice; + WALBERLA_GPU_CHECK(gpuMemcpy3D(&p)) } - - p.dstPtr = dst.pitchedPtr(); - p.kind = gpuMemcpyHostToDevice; - WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) ) } @@ -155,56 +147,48 @@ namespace gpu template<typename T, uint_t fs> void fieldCpy( field::Field<T,fs> & dst, const gpu::GPUField<T> & src ) { - gpuMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ) - } - - bool canCopy = ( src.layout() == fzyx && - dst.fAllocSize() == src.fAllocSize() && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xSize() == src.xSize() ) - || - ( src.layout() == zyxf && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() && - dst.fSize() == src.fSize() ); - - if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size ") - } - - if ( dst.layout() == fzyx ) + WALBERLA_DEVICE_SECTION() { - p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()), // pointer - sizeof(T) * dst.xAllocSize(), // pitch - dst.xAllocSize(), // inner dimension size - dst.yAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); - p.extent.height = dst.yAllocSize(); - p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + gpuMemcpy3DParms p; + memset(&p, 0, sizeof(p)); + + if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") } + + bool canCopy = + (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) || + (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize()); + + if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") } + + if (dst.layout() == fzyx) + { + p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()), // pointer + sizeof(T) * dst.xAllocSize(), // pitch + dst.xAllocSize(), // inner dimension size + dst.yAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + } + else + { + p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()), // pointer + sizeof(T) * dst.fAllocSize(), // pitch + dst.fAllocSize(), // inner dimension size + dst.xAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + } + + p.srcPtr = src.pitchedPtr(); + p.kind = gpuMemcpyDeviceToHost; + WALBERLA_GPU_CHECK(gpuMemcpy3D(&p)) } - else - { - p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()), // pointer - sizeof(T) * dst.fAllocSize(), // pitch - dst.fAllocSize(), // inner dimension size - dst.xAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); - p.extent.height = dst.xAllocSize(); - p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); - } - - p.srcPtr = src.pitchedPtr(); - p.kind = gpuMemcpyDeviceToHost; - WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) ) - } } // namespace gpu diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h index c11953e4f..51b337e61 100644 --- a/src/gpu/FieldIndexing.h +++ b/src/gpu/FieldIndexing.h @@ -23,6 +23,7 @@ #pragma once #include "stencil/Directions.h" +#include "gpu/DeviceWrapper.h" #include "FieldAccessor.h" diff --git a/src/gpu/FieldIndexing.impl.h b/src/gpu/FieldIndexing.impl.h index 922a48b9b..a8c9feccf 100644 --- a/src/gpu/FieldIndexing.impl.h +++ b/src/gpu/FieldIndexing.impl.h @@ -44,17 +44,21 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - gpuDeviceProp prop; - int count; - gpuGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - gpuGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per GPU block." ) } } diff --git a/src/gpu/FieldIndexing3D.impl.h b/src/gpu/FieldIndexing3D.impl.h index a8cc922cf..5aa027872 100644 --- a/src/gpu/FieldIndexing3D.impl.h +++ b/src/gpu/FieldIndexing3D.impl.h @@ -52,17 +52,21 @@ FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - gpuDeviceProp prop; - int count; - gpuGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - gpuGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per GPU block." ) } } diff --git a/src/gpu/FieldIndexingXYZ.h b/src/gpu/FieldIndexingXYZ.h index b6da50d5d..f62161bd1 100644 --- a/src/gpu/FieldIndexingXYZ.h +++ b/src/gpu/FieldIndexingXYZ.h @@ -21,6 +21,9 @@ #pragma once +#include "core/DataTypes.h" + +#include "DeviceWrapper.h" #include "FieldAccessorXYZ.h" namespace walberla { namespace cell { class CellInterval; } } diff --git a/src/gpu/FieldIndexingXYZ.impl.h b/src/gpu/FieldIndexingXYZ.impl.h index 9ec8b6c08..d75560b9d 100644 --- a/src/gpu/FieldIndexingXYZ.impl.h +++ b/src/gpu/FieldIndexingXYZ.impl.h @@ -41,17 +41,21 @@ FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - gpuDeviceProp prop; - int count; - gpuGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - gpuGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per GPU block." ) } } diff --git a/src/gpu/GPUCopy.h b/src/gpu/GPUCopy.h index e74d04ddb..08e94a38a 100644 --- a/src/gpu/GPUCopy.h +++ b/src/gpu/GPUCopy.h @@ -25,7 +25,8 @@ #include "core/DataTypes.h" -#include "ErrorChecking.h" +#include "gpu/ErrorChecking.h" +#include "gpu/DeviceWrapper.h" #include <tuple> diff --git a/src/gpu/GPUField.h b/src/gpu/GPUField.h index ecc9ccc5b..f8a0242ed 100755 --- a/src/gpu/GPUField.h +++ b/src/gpu/GPUField.h @@ -28,7 +28,7 @@ #include "stencil/Directions.h" -#include "gpu/GPUWrapper.h" +#include "gpu/DeviceWrapper.h" namespace walberla { namespace gpu @@ -40,7 +40,7 @@ namespace gpu //******************************************************************************************************************* - /*! GhostLayerField stored on a CUDA GPU + /*! GhostLayerField stored on a CUDA/HIP GPU * * Basically a wrapper around a CUDA/HIP device pointer together with size information about the field * i.e. sizes in x,y,z,f directions and number of ghost layers. @@ -155,7 +155,7 @@ namespace gpu //**************************************************************************************************************** protected: - gpuPitchedPtr pitchedPtr_; + gpuPitchedPtr pitchedPtr_; uint_t nrOfGhostLayers_; uint_t xSize_; uint_t ySize_; diff --git a/src/gpu/GPUField.impl.h b/src/gpu/GPUField.impl.h index 9c1242aa9..dd42d088c 100644 --- a/src/gpu/GPUField.impl.h +++ b/src/gpu/GPUField.impl.h @@ -22,6 +22,7 @@ #include "GPUField.h" #include "ErrorChecking.h" #include "AlignedAllocation.h" +#include "DeviceWrapper.h" #include "core/logging/Logging.h" namespace walberla { @@ -36,40 +37,44 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), layout_( _layout ), usePitchedMem_( usePitchedMem ), timestepCounter_(0) { + WALBERLA_NON_DEVICE_SECTION() { + WALBERLA_ABORT(__FUNCTION__ << "Instantiating GPU field without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + gpuExtent extent; - if ( layout_ == zyxf ) + if (layout_ == zyxf) { extent.width = _fSize * sizeof(T); - extent.height = (_xSize + 2 * _nrOfGhostLayers ); - extent.depth = (_ySize + 2 * _nrOfGhostLayers ) * ( _zSize + 2 * _nrOfGhostLayers ); + extent.height = (_xSize + 2 * _nrOfGhostLayers); + extent.depth = (_ySize + 2 * _nrOfGhostLayers) * (_zSize + 2 * _nrOfGhostLayers); } else { - extent.width = (_xSize + 2 * _nrOfGhostLayers ) * sizeof(T); - extent.height = (_ySize + 2 * _nrOfGhostLayers ); - extent.depth = (_zSize + 2 * _nrOfGhostLayers ) * _fSize; + extent.width = (_xSize + 2 * _nrOfGhostLayers) * sizeof(T); + extent.height = (_ySize + 2 * _nrOfGhostLayers); + extent.depth = (_zSize + 2 * _nrOfGhostLayers) * _fSize; } - if ( usePitchedMem_ ) + if (usePitchedMem_) { size_t pitch; const size_t alignment = 256; - void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment, - sizeof(T) * nrOfGhostLayers_ ); - WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 ) - pitchedPtr_ = make_gpuPitchedPtr( mem, pitch, extent.width, extent.height ); + void* mem = allocate_pitched_with_offset(pitch, extent.width, extent.height * extent.depth, alignment, + sizeof(T) * nrOfGhostLayers_); + WALBERLA_ASSERT_EQUAL(size_t((char*) (mem) + sizeof(T) * nrOfGhostLayers_) % alignment, 0) + pitchedPtr_ = make_gpuPitchedPtr(mem, pitch, extent.width, extent.height); } else { - pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height ); - WALBERLA_GPU_CHECK ( gpuMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ) + pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height); + WALBERLA_GPU_CHECK(gpuMalloc(&pitchedPtr_.ptr, extent.width * extent.height * extent.depth)) } // allocation size is stored in pitched pointer // pitched pointer stores the amount of padded region in bytes // but we keep track of the size in #elements - WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ) - if ( layout_ == field::fzyx ) + WALBERLA_ASSERT_EQUAL(pitchedPtr_.pitch % sizeof(T), 0) + if (layout_ == field::fzyx) { xAllocSize_ = pitchedPtr_.pitch / sizeof(T); fAllocSize_ = fSize_; diff --git a/src/gpu/HostFieldAllocator.h b/src/gpu/HostFieldAllocator.h index 2b7311add..8b24c3a47 100644 --- a/src/gpu/HostFieldAllocator.h +++ b/src/gpu/HostFieldAllocator.h @@ -22,7 +22,8 @@ #pragma once -#include "ErrorChecking.h" +#include "gpu/ErrorChecking.h" +#include "gpu/DeviceWrapper.h" #include "field/allocation/FieldAllocator.h" @@ -51,22 +52,37 @@ namespace gpu virtual T * allocateMemory ( uint_t size0, uint_t size1, uint_t size2, uint_t size3, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + allocSize1=size1; allocSize2=size2; allocSize3=size3; - void * result; - WALBERLA_GPU_CHECK( gpuHostAlloc( &result, size0*size1*size2*size3*sizeof(T), HostAllocFlags ) ) + void * result = nullptr; + WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size0 * size1 * size2 * size3 * sizeof(T), HostAllocFlags)) return (T*)(result); } virtual T * allocateMemory ( uint_t size ) { - T* result; - gpuHostAlloc( &result, size*sizeof(T), HostAllocFlags ); - return result; + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + + void * result = nullptr; + WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size*sizeof(T), HostAllocFlags)) + return (T*)(result); } - virtual void deallocate(T *& values) {WALBERLA_GPU_CHECK( gpuFreeHost( values ) )} + virtual void deallocate(T *& values) { + WALBERLA_NON_DEVICE_SECTION() { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + WALBERLA_GPU_CHECK(gpuFreeHost(values)) + } }; diff --git a/src/gpu/Kernel.h b/src/gpu/Kernel.h index 6fca21052..f6c2eb687 100644 --- a/src/gpu/Kernel.h +++ b/src/gpu/Kernel.h @@ -236,7 +236,10 @@ namespace gpu // .. and launch the kernel static_assert( sizeof(void *) == sizeof(void (*)(void)), "object pointer and function pointer sizes must be equal" ); - WALBERLA_GPU_CHECK( gpuLaunchKernel( (void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream ) ) + WALBERLA_DEVICE_SECTION() + { + WALBERLA_GPU_CHECK(gpuLaunchKernel((void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream)) + } } diff --git a/src/gpu/NVTX.h b/src/gpu/NVTX.h index 46302f917..86e3e6a3a 100644 --- a/src/gpu/NVTX.h +++ b/src/gpu/NVTX.h @@ -21,11 +21,14 @@ #include "core/DataTypes.h" -#include <string> +#include "DeviceWrapper.h" -#include <nvToolsExt.h> -#include <nvToolsExtCuda.h> -#include <nvToolsExtCudaRt.h> +#if defined(WALBERLA_BUILD_WITH_CUDA) + #include <nvToolsExt.h> + #include <nvToolsExtCuda.h> + #include <nvToolsExtCudaRt.h> + #include <string> +#endif namespace walberla{ namespace gpu @@ -33,20 +36,30 @@ namespace gpu inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa) { - nvtxEventAttributes_t eventAttrib; - memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = 0xFF000000 | color; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name.c_str(); - nvtxMarkEx(&eventAttrib); +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxEventAttributes_t eventAttrib; + memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = 0xFF000000 | color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + nvtxMarkEx(&eventAttrib); +#else + WALBERLA_UNUSED(name); + WALBERLA_UNUSED(color); +#endif } inline void nameStream(const cudaStream_t & stream, const std::string & name) { - nvtxNameCudaStreamA(stream, name.c_str()); +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxNameCudaStreamA(stream, name.c_str()); +#else + WALBERLA_UNUSED(stream); + WALBERLA_UNUSED(name); +#endif } class NvtxRange @@ -54,21 +67,31 @@ class NvtxRange public: NvtxRange(const std::string & name, const uint32_t color=0xaaaaaa) { - memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = 0xFF000000 | color; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name.c_str(); - nvtxRangePushEx(&eventAttrib); +#if defined(WALBERLA_BUILD_WITH_CUDA) + memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = 0xFF000000 | color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + nvtxRangePushEx(&eventAttrib); +#else + WALBERLA_UNUSED(name); + WALBERLA_UNUSED(color); +#endif } + ~NvtxRange() { - nvtxRangePop(); +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxRangePop(); +#endif } private: +#if defined(WALBERLA_BUILD_WITH_CUDA) nvtxEventAttributes_t eventAttrib; +#endif }; diff --git a/src/gpu/ParallelStreams.cpp b/src/gpu/ParallelStreams.cpp index 2dffc7f0a..aed66f693 100644 --- a/src/gpu/ParallelStreams.cpp +++ b/src/gpu/ParallelStreams.cpp @@ -21,6 +21,7 @@ #include "gpu/ParallelStreams.h" +#include "gpu/DeviceWrapper.h" namespace walberla { namespace gpu @@ -30,32 +31,40 @@ namespace gpu ParallelSection::ParallelSection(ParallelStreams * parent, gpuStream_t mainStream) : parent_( parent ), mainStream_( mainStream ), counter_( 0 ) { - WALBERLA_GPU_CHECK( gpuEventCreate(&startEvent_) ) - WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) ) + WALBERLA_DEVICE_SECTION() + { + WALBERLA_GPU_CHECK(gpuEventCreate(&startEvent_)) + WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_)) + } } ParallelSection::~ParallelSection() { - synchronize(); - WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) ) + WALBERLA_DEVICE_SECTION() + { + synchronize(); + WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) ) + } } void ParallelSection::next() { - if( counter_ > 0 ) { - WALBERLA_GPU_CHECK( gpuEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) ) - } - else { - WALBERLA_GPU_CHECK( gpuEventRecord( parent_->mainEvent_, mainStream_ ) ) - } - ++counter_; + WALBERLA_DEVICE_SECTION() + { + if (counter_ > 0) + { + WALBERLA_GPU_CHECK(gpuEventRecord(parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1])) + } + else { WALBERLA_GPU_CHECK(gpuEventRecord(parent_->mainEvent_, mainStream_)) } + ++counter_; - parent_->ensureSize( counter_ ); + parent_->ensureSize(counter_); - WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream(), startEvent_, 0 )) + WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream(), startEvent_, 0)) + } } - void ParallelSection::run(const std::function<void( gpuStream_t)> & f) + void ParallelSection::run(const std::function<void(gpuStream_t)> & f) { f( stream() ); next(); @@ -63,18 +72,20 @@ namespace gpu void ParallelSection::synchronize() { - for( uint_t i=0; i < counter_; ++i ) - for( uint_t j=0; j < counter_; ++j ) - { - if( i == j ) - continue; + WALBERLA_DEVICE_SECTION() + { + for (uint_t i = 0; i < counter_; ++i) + for (uint_t j = 0; j < counter_; ++j) + { + if (i == j) continue; - auto & event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; - gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; - WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream, event, 0 )) - } + auto& event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; + gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; + WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream, event, 0)) + } - WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) ) + WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_)) + } } gpuStream_t ParallelSection::stream() diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h index 28033d146..8a8616c1e 100644 --- a/src/gpu/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -321,7 +321,7 @@ namespace communication { } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(cudaStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) { return [this, stream]() { wait( stream ); }; } diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h index 4a082d341..8d95855e6 100644 --- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h @@ -80,7 +80,7 @@ class BasicRecursiveTimeStepGPU ~BasicRecursiveTimeStepGPU() = default; void operator()() { timestep(0); }; - void addRefinementToTimeLoop(timeloop::SweepTimeloop& timeloop, uint_t level = 0); + void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level = 0); void test(uint_t maxLevel, uint_t level = 0); private: diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h index f7c5b2878..5bb43c3c8 100644 --- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h @@ -94,7 +94,7 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > -void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level) +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level) { // 1.1 Collision timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h index 6b0a2a7ec..7c6fdc828 100644 --- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h @@ -71,7 +71,7 @@ class BasicRecursiveTimeStep }; void operator() () { timestep(0); }; - void addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level=0); + void addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level=0); private: void timestep(uint_t level); diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h index 7e6d9b594..29f7de765 100644 --- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h @@ -90,7 +90,7 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > -void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level) +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level) { // 1.1 Collision timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); diff --git a/src/timeloop/CMakeLists.txt b/src/timeloop/CMakeLists.txt index 9035c3d9a..ba2ef178c 100644 --- a/src/timeloop/CMakeLists.txt +++ b/src/timeloop/CMakeLists.txt @@ -14,8 +14,6 @@ target_sources( timeloop SweepTimeloop.h Timeloop.h PerformanceMeter.cpp - SweepTimeloop.cpp - Timeloop.cpp ) ################################################################################################### diff --git a/src/timeloop/SelectableFunctionCreators.h b/src/timeloop/SelectableFunctionCreators.h index b014f44de..b877b293e 100644 --- a/src/timeloop/SelectableFunctionCreators.h +++ b/src/timeloop/SelectableFunctionCreators.h @@ -186,7 +186,7 @@ namespace timeloop { private: - friend class SweepTimeloop; + template < typename TimingPolicy > friend class SweepTimeloop; BlockStorage & bs_; diff --git a/src/timeloop/SweepTimeloop.h b/src/timeloop/SweepTimeloop.h index 4ffc8df9e..19e9344a6 100644 --- a/src/timeloop/SweepTimeloop.h +++ b/src/timeloop/SweepTimeloop.h @@ -112,7 +112,8 @@ namespace timeloop { * \ingroup timeloop */ //******************************************************************************************************************* - class SweepTimeloop : public Timeloop + template < typename TP = timing::WcPolicy> + class SweepTimeloop : public Timeloop<TP> { public: @@ -121,11 +122,11 @@ namespace timeloop { //@{ SweepTimeloop( BlockStorage & blockStorage, uint_t nrOfTimeSteps ) - : Timeloop(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true) + : Timeloop<TP>(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true) {} SweepTimeloop( const shared_ptr<StructuredBlockStorage> & structuredBlockStorage, uint_t nrOfTimeSteps ) - : Timeloop(nrOfTimeSteps), blockStorage_( structuredBlockStorage->getBlockStorage() ), + : Timeloop<TP>(nrOfTimeSteps), blockStorage_( structuredBlockStorage->getBlockStorage() ), nextId_(0), firstRun_(true) {} @@ -167,7 +168,7 @@ namespace timeloop { } void doTimeStep(const Set<SUID> &selectors) override; - void doTimeStep(const Set<SUID> &selectors, WcTimingPool &tp) override; + void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &tp) override; uint_t nextId_; std::vector<uint_t> sweepsToDelete_; @@ -180,7 +181,7 @@ namespace timeloop { } // namespace timeloop } // namespace walberla - +#include "SweepTimeloop.impl.h" //====================================================================================================================== // @@ -189,7 +190,8 @@ namespace timeloop { //====================================================================================================================== namespace walberla { - using timeloop::SweepTimeloop; + using SweepTimeloop = typename timeloop::SweepTimeloop < >; + using DeviceSynchronizeSweepTimeloop = typename timeloop::SweepTimeloop < timing::DeviceSynchronizePolicy >; using timeloop::Sweep; using timeloop::SweepOnBlock; diff --git a/src/timeloop/SweepTimeloop.cpp b/src/timeloop/SweepTimeloop.impl.h similarity index 90% rename from src/timeloop/SweepTimeloop.cpp rename to src/timeloop/SweepTimeloop.impl.h index 5721c51c7..481ddbaca 100644 --- a/src/timeloop/SweepTimeloop.cpp +++ b/src/timeloop/SweepTimeloop.impl.h @@ -32,8 +32,8 @@ namespace timeloop { ////////////////////////// Execution of Timeloop //////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) +template < typename TP > +void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors) { removeForDeletionMarkedSweeps(); //iterate over all registered sweeps @@ -43,7 +43,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) //select and execute before functions for( size_t j=0; j < s.beforeFuncs.size(); ++j ) - executeSelectable(s.beforeFuncs[j].selectableFunc_,selectors,"Pre-Sweep Function"); + this->executeSelectable(s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function"); // Loop over all blocks for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi ) @@ -81,11 +81,12 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) // select and execute after functions for( size_t j=0; j < s.afterFuncs.size(); ++j ) - executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function"); + this->executeSelectable(s.afterFuncs[j].selectableFunc_, selectors, "Post-Sweep Function"); } } -void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) +template < typename TP > +void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing) { removeForDeletionMarkedSweeps(); // On first run we extract all possible names of sweeps, independent of selectors @@ -113,7 +114,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) //select and execute before functions for( size_t j=0; j < s.beforeFuncs.size(); ++j ) - executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing ); + this->executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing ); for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi ) { @@ -149,7 +150,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) // select and execute after functions for( size_t j=0; j < s.afterFuncs.size(); ++j ) - executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing ); + this->executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing ); } } diff --git a/src/timeloop/Timeloop.h b/src/timeloop/Timeloop.h index 48b7de6ce..faed83b06 100644 --- a/src/timeloop/Timeloop.h +++ b/src/timeloop/Timeloop.h @@ -48,6 +48,8 @@ using VoidFctNoArguments = std::function<void ()>; * \ingroup timeloop */ //******************************************************************************************************************* + +template < typename TP = timing::WcPolicy > class Timeloop : public ITimeloop { private: @@ -109,7 +111,10 @@ public: //**Construction & Destruction************************************************************************************ /*! \name Construction & Destruction */ //@{ - Timeloop( uint_t nrOfTimeSteps ); + Timeloop( uint_t nrOfTimeSteps ) + : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false ) + { + } ~Timeloop() override = default; //@} @@ -121,17 +126,17 @@ public: //@{ void run() override { run(true); } void run( const bool logTimeStep ); - void run( WcTimingPool & timing, const bool logTimeStep = true ); + void run( timing::TimingPool<TP> & timing, const bool logTimeStep = true ); void singleStep() override { singleStep(true); } void singleStep( const bool logTimeStep ); - void singleStep( WcTimingPool & timing, const bool logTimeStep = true ); + void singleStep( timing::TimingPool<TP> & timing, const bool logTimeStep = true ); void stop() override; void synchronizedStop( bool stop ) override; void setCurrentTimeStepToZero() { curTimeStep_ = 0; } - void setCurrentTimeStep( uint_t ts) override { curTimeStep_ = ts; } + void setCurrentTimeStep( uint_t ts ) override { curTimeStep_ = ts; } //@} //**************************************************************************************************************** @@ -183,7 +188,7 @@ public: protected: virtual void doTimeStep(const Set<SUID> &selectors) = 0; - virtual void doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) = 0; + virtual void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing) = 0; void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, @@ -192,7 +197,7 @@ protected: void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what, - WcTimingPool & tp); + timing::TimingPool<TP> & tp); uint_t curTimeStep_; ///< current time step @@ -210,6 +215,8 @@ protected: } // namespace timeloop } // namespace walberla +#include "Timeloop.impl.h" + //====================================================================================================================== @@ -219,6 +226,7 @@ protected: //====================================================================================================================== namespace walberla { - using timeloop::Timeloop; + using Timeloop = typename timeloop::Timeloop < >; + using DeviceSynchronizeTimeloop = typename timeloop::Timeloop < timing::DeviceSynchronizePolicy >; } diff --git a/src/timeloop/Timeloop.cpp b/src/timeloop/Timeloop.impl.h similarity index 83% rename from src/timeloop/Timeloop.cpp rename to src/timeloop/Timeloop.impl.h index fd46e16c1..832f1c7ad 100644 --- a/src/timeloop/Timeloop.cpp +++ b/src/timeloop/Timeloop.impl.h @@ -29,16 +29,8 @@ namespace walberla { namespace timeloop { - -Timeloop::Timeloop( uint_t nrOfTimeSteps) - : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false ) -{ -} - - - - -void Timeloop::run( const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::run( const bool logTimeStep ) { WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) while(curTimeStep_ < nrOfTimeSteps_) { @@ -51,7 +43,8 @@ void Timeloop::run( const bool logTimeStep ) WALBERLA_LOG_PROGRESS( "Timeloop finished" ) } -void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::run(timing::TimingPool<TP> & tp, const bool logTimeStep ) { WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) @@ -74,7 +67,9 @@ void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) * before reaching nrOfTimeSteps */ //******************************************************************************************************************* -void Timeloop::stop() + +template < typename TP > +void Timeloop<TP>::stop() { stop_ = true; } @@ -89,13 +84,16 @@ void Timeloop::stop() * -> If at least on process calls synchronizedStop(true) the timeloop is stopped */ //******************************************************************************************************************* -void Timeloop::synchronizedStop( bool stopVal ) + +template < typename TP > +void Timeloop<TP>::synchronizedStop( bool stopVal ) { stop_ = stopVal; mpi::allReduceInplace( stop_, mpi::LOGICAL_OR ); } -void Timeloop::singleStep( const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::singleStep( const bool logTimeStep ) { LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); @@ -112,7 +110,8 @@ void Timeloop::singleStep( const bool logTimeStep ) ++curTimeStep_; } -void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::singleStep( timing::TimingPool<TP> & tp, const bool logTimeStep ) { LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); @@ -133,17 +132,17 @@ void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep ) ////////////////////////////////////////// Registering Functions /////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -Timeloop::FctHandle -Timeloop::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id, +template < typename TP > +typename Timeloop<TP>::FctHandle +Timeloop<TP>::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id, const Set<SUID> & r, const Set<SUID> & e ) { beforeFunctions_.emplace_back(f,r,e,id ); return beforeFunctions_.size() - 1; } - -void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, +template < typename TP > +void Timeloop<TP>::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { @@ -152,17 +151,17 @@ void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, } - -Timeloop::FctHandle -Timeloop::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id, +template < typename TP > +typename Timeloop<TP>::FctHandle +Timeloop<TP>::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id, const Set<SUID> & r, const Set<SUID> & e ) { afterFunctions_.emplace_back(f,r,e,id ); return afterFunctions_.size() - 1; } - -void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h, +template < typename TP > +void Timeloop<TP>::addFuncAfterTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { @@ -172,8 +171,8 @@ void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h, - -void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, +template < typename TP > +void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what ) { @@ -192,10 +191,11 @@ void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN LIKWID_MARKER_STOP( objectName.c_str() ); } -void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, +template < typename TP > +void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what, - WcTimingPool & timing ) + timing::TimingPool<TP> & timing ) { std::string objectName; const VoidFctNoArguments * exe = selectable.getUnique( selector, objectName ); diff --git a/tests/pe/Refinement.cpp b/tests/pe/Refinement.cpp index 943526d15..96900a625 100644 --- a/tests/pe/Refinement.cpp +++ b/tests/pe/Refinement.cpp @@ -167,7 +167,7 @@ int main( int argc, char ** argv ) createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(5,5,5), 1); createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(15,6,6), 1); - timeloop::SweepTimeloop timeloop( forest->getBlockStorage(), 1 ); + SweepTimeloop timeloop( forest->getBlockStorage(), 1 ); timeloop.addFuncBeforeTimeStep( simpleLB, "refreshFunctorName" ); for (int i = 0; i < 1; ++i) -- GitLab