From 6ce8f928a2571473a4820a76ce29c561c92ad22e Mon Sep 17 00:00:00 2001
From: Richard Angersbach <richard.angersbach@fau.de>
Date: Fri, 30 Jun 2023 15:09:59 +0200
Subject: [PATCH] Missing synchronization for time measurements with CUDA

---
 .../FlowAroundSphereCodeGen/CMakeLists.txt    |   8 +-
 .../PhaseFieldAllenCahn/CMakeLists.txt        |   4 +-
 apps/tutorials/codegen/CMakeLists.txt         |   5 +-
 src/CMakeLists.txt                            |   4 +-
 src/core/mpi/Datatype.h                       |  12 +-
 src/core/mpi/MPIWrapper.h                     |  15 +-
 src/core/timing/CMakeLists.txt                |   1 +
 src/core/timing/DeviceSynchronizePolicy.h     |  84 +++++
 src/core/timing/Timer.h                       |   6 +-
 src/core/timing/TimingNode.cpp                |   1 +
 src/core/timing/TimingNode.h                  |   1 +
 src/core/timing/TimingPool.cpp                |   1 +
 src/core/timing/TimingPool.h                  |   1 +
 src/core/timing/TimingTree.cpp                |   1 +
 src/core/timing/TimingTree.h                  |   1 +
 src/gpu/AlignedAllocation.cpp                 |  15 +-
 src/gpu/CMakeLists.txt                        |   1 +
 src/gpu/DeviceSelectMPI.cpp                   |  59 ++--
 src/gpu/DeviceSelectMPI.h                     |   1 +
 src/gpu/DeviceWrapper.h                       | 292 ++++++++++++++++++
 src/gpu/ErrorChecking.h                       |   6 +-
 src/gpu/FieldAccessor.h                       |   2 +-
 src/gpu/FieldCopy.h                           | 176 +++++------
 src/gpu/FieldIndexing.h                       |   1 +
 src/gpu/FieldIndexing.impl.h                  |  24 +-
 src/gpu/FieldIndexing3D.impl.h                |  24 +-
 src/gpu/FieldIndexingXYZ.h                    |   3 +
 src/gpu/FieldIndexingXYZ.impl.h               |  24 +-
 src/gpu/GPUCopy.h                             |   3 +-
 src/gpu/GPUField.h                            |   6 +-
 src/gpu/GPUField.impl.h                       |  35 ++-
 src/gpu/HostFieldAllocator.h                  |  30 +-
 src/gpu/Kernel.h                              |   5 +-
 src/gpu/NVTX.h                                |  69 +++--
 src/gpu/ParallelStreams.cpp                   |  59 ++--
 src/gpu/communication/UniformGPUScheme.impl.h |   2 +-
 .../gpu/BasicRecursiveTimeStepGPU.h           |   2 +-
 .../gpu/BasicRecursiveTimeStepGPU.impl.h      |   2 +-
 .../refinement/BasicRecursiveTimeStep.h       |   2 +-
 .../refinement/BasicRecursiveTimeStep.impl.h  |   2 +-
 src/timeloop/CMakeLists.txt                   |   2 -
 src/timeloop/SelectableFunctionCreators.h     |   2 +-
 src/timeloop/SweepTimeloop.h                  |  14 +-
 ...SweepTimeloop.cpp => SweepTimeloop.impl.h} |  15 +-
 src/timeloop/Timeloop.h                       |  22 +-
 .../{Timeloop.cpp => Timeloop.impl.h}         |  58 ++--
 tests/pe/Refinement.cpp                       |   2 +-
 47 files changed, 786 insertions(+), 319 deletions(-)
 create mode 100644 src/core/timing/DeviceSynchronizePolicy.h
 create mode 100644 src/gpu/DeviceWrapper.h
 rename src/timeloop/{SweepTimeloop.cpp => SweepTimeloop.impl.h} (90%)
 rename src/timeloop/{Timeloop.cpp => Timeloop.impl.h} (83%)

diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
index 40a17bda2..17cfd93fd 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
@@ -11,10 +11,10 @@ waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated
         FlowAroundSphereCodeGen_PackInfoOdd.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoOdd.h
         FlowAroundSphereCodeGen_InfoHeader.h)
 
-if (WALBERLA_BUILD_WITH_CUDA)
-        waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
-                DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
+if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
+    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
+            DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
 else ()
     waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
             DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
-endif (WALBERLA_BUILD_WITH_CUDA)
+endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
\ No newline at end of file
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
index 52d29a0fb..1b530d61a 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
+++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
@@ -12,7 +12,7 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
         PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h
         GenDefines.h)
 
-if (WALBERLA_BUILD_WITH_CUDA)
+if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
     waLBerla_add_executable(NAME benchmark_multiphase
             FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
             DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
@@ -20,5 +20,5 @@ else ()
     waLBerla_add_executable(NAME benchmark_multiphase
             FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
             DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
-endif (WALBERLA_BUILD_WITH_CUDA)
+endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
 
diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt
index 2a56a5b67..4b50efaa3 100644
--- a/apps/tutorials/codegen/CMakeLists.txt
+++ b/apps/tutorials/codegen/CMakeLists.txt
@@ -24,7 +24,6 @@ if( WALBERLA_BUILD_WITH_CODEGEN )
                     DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython )
 
     #   Tutorial 3: Advanced lbmpy Code Generation
-
     walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
         FILE 03_AdvancedLBMCodegen.py
         OUT_FILES   CumulantMRTSweep.${CODEGEN_FILE_SUFFIX} CumulantMRTSweep.h
@@ -34,8 +33,8 @@ if( WALBERLA_BUILD_WITH_CODEGEN )
 
     if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
         walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
-                        FILES 03_AdvancedLBMCodegen.cpp
-                        DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
+                FILES 03_AdvancedLBMCodegen.cpp
+                DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
     else()
         walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
                 FILES 03_AdvancedLBMCodegen.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92b465e32..d49a1e63b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,9 +24,7 @@ add_subdirectory( blockforest )
 add_subdirectory( boundary )
 add_subdirectory( communication )
 add_subdirectory( core )
-if ( WALBERLA_BUILD_WITH_GPU_SUPPORT )
-   add_subdirectory(gpu)
-endif()
+add_subdirectory(gpu)
 add_subdirectory( domain_decomposition )
 add_subdirectory( executiontree )
 if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND )
diff --git a/src/core/mpi/Datatype.h b/src/core/mpi/Datatype.h
index 80b7931ef..f717cb6d9 100644
--- a/src/core/mpi/Datatype.h
+++ b/src/core/mpi/Datatype.h
@@ -40,23 +40,17 @@ namespace mpi {
 
       Datatype( MPI_Datatype datatype) : mpiDatatype_( datatype )
       {
-#ifdef WALBERLA_BUILD_WITH_MPI
-         MPI_Type_commit( &mpiDatatype_ );
-#endif
+         WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); }
       }
 
       void init( MPI_Datatype datatype )
       {
          mpiDatatype_ = datatype;
-#ifdef WALBERLA_BUILD_WITH_MPI
-         MPI_Type_commit( &mpiDatatype_ );
-#endif
+         WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); }
       }
 
       ~Datatype() {
-#ifdef WALBERLA_BUILD_WITH_MPI
-         MPI_Type_free( & mpiDatatype_ );
-#endif
+         WALBERLA_MPI_SECTION() { MPI_Type_free( & mpiDatatype_ ); }
       }
 
       operator MPI_Datatype() const {
diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h
index cd250cb97..6b406c631 100644
--- a/src/core/mpi/MPIWrapper.h
+++ b/src/core/mpi/MPIWrapper.h
@@ -120,6 +120,8 @@ struct MPI_Status
 const int MPI_COMM_NULL  = 0;
 const int MPI_COMM_WORLD = 1;
 
+const int MPI_COMM_TYPE_SHARED = 0;
+
 const int MPI_SUCCESS = 1;
 
 
@@ -202,11 +204,14 @@ inline int MPI_Comm_size( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR }
 inline int MPI_Comm_rank( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR }
 inline int MPI_Comm_get_name( MPI_Comm, char*, int* ) { WALBERLA_MPI_FUNCTION_ERROR }
 
-inline int MPI_Comm_group ( MPI_Comm, MPI_Group* )           { WALBERLA_MPI_FUNCTION_ERROR }
-inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR }
-inline int MPI_Comm_free  ( MPI_Comm* )                      { WALBERLA_MPI_FUNCTION_ERROR }
-inline int MPI_Comm_dup   ( MPI_Comm, MPI_Comm *)            { WALBERLA_MPI_FUNCTION_ERROR }
-inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *)  { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Info_create ( MPI_Info * ) { WALBERLA_MPI_FUNCTION_ERROR }
+
+inline int MPI_Comm_group ( MPI_Comm, MPI_Group* )                         { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* )               { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Comm_free  ( MPI_Comm* )                                    { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Comm_dup   ( MPI_Comm, MPI_Comm *)                          { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *)                { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Comm_split_type ( MPI_Comm, int, int, MPI_Info, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR }
 
 
 inline int MPI_Cart_create( MPI_Comm, int, int*, int*, int, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR }
diff --git a/src/core/timing/CMakeLists.txt b/src/core/timing/CMakeLists.txt
index b949b2eeb..1de08d962 100644
--- a/src/core/timing/CMakeLists.txt
+++ b/src/core/timing/CMakeLists.txt
@@ -16,4 +16,5 @@ target_sources( core
          TimingTree.cpp
          TimingTree.h
          WcPolicy.h
+         DeviceSynchronizePolicy.h
       )
diff --git a/src/core/timing/DeviceSynchronizePolicy.h b/src/core/timing/DeviceSynchronizePolicy.h
new file mode 100644
index 000000000..7c494e48d
--- /dev/null
+++ b/src/core/timing/DeviceSynchronizePolicy.h
@@ -0,0 +1,84 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DeviceSynchronizePolicy.h
+//! \ingroup core
+//! \author Richard Angersbach
+//! \brief Gpu Timing Policy
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "gpu/DeviceWrapper.h"
+
+#include "Time.h"
+
+namespace walberla
+{
+namespace timing
+{
+
+//======================================================================================================================
+//
+//  CLASS DEFINITION
+//
+//======================================================================================================================
+
+//**********************************************************************************************************************
+/*!\brief Timing policy for the measurement of the GPU time.
+// \ingroup timing
+//
+// The DeviceSynchronizePolicy class represents the timing policy for GPU time measurements that can be used
+// in combination with the Timer class template. This combination is realized with the DeviceSynchronizePolicy
+// type definition.
+// This class uses device synchronization internally and is therefore not suited for CUDA
+// applications with overlapping kernels.
+*/
+struct DeviceSynchronizePolicy
+{
+ public:
+   //**Timing functions****************************************************************************
+   /*!\name Timing functions */
+   //@{
+   static inline double getTimestamp();
+   //@}
+   //*******************************************************************************************************************
+};
+//**********************************************************************************************************************
+
+//======================================================================================================================
+//
+//  TIMING FUNCTIONS
+//
+//======================================================================================================================
+
+//**********************************************************************************************************************
+/*!\brief Returns a timestamp of the current GPU time in seconds. Uses wall clock time and device synchronization
+internally.
+//
+// \return GPU timestamp in seconds.
+*/
+inline double DeviceSynchronizePolicy::getTimestamp()
+{
+   // synchronize device before getting timestamp
+   WALBERLA_DEVICE_SECTION() { gpuDeviceSynchronize(); }
+
+   return getWcTime();
+}
+//**********************************************************************************************************************
+
+} // namespace timing
+} // namespace walberla
diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h
index 32c1e7f30..9f7c3f97d 100644
--- a/src/core/timing/Timer.h
+++ b/src/core/timing/Timer.h
@@ -25,14 +25,17 @@
 #pragma once
 
 #include "CpuPolicy.h"
+#include "DeviceSynchronizePolicy.h"
 #include "ReduceType.h"
 #include "WcPolicy.h"
-#include "core/DataTypes.h"
 
+#include "core/DataTypes.h"
 #include "core/mpi/RecvBuffer.h"
 #include "core/mpi/Reduce.h"
 #include "core/mpi/SendBuffer.h"
 
+#include "gpu/DeviceWrapper.h"
+
 #include <iomanip>
 #include <iostream>
 #include <limits>
@@ -590,6 +593,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, Timer<TP
 } //namespace timing
 
 using CpuTimer = timing::Timer<timing::CpuPolicy>;
+using DeviceSynchronizeTimer = timing::Timer<timing::DeviceSynchronizePolicy>;
 using WcTimer = timing::Timer<timing::WcPolicy>;
 
 } // namespace walberla
diff --git a/src/core/timing/TimingNode.cpp b/src/core/timing/TimingNode.cpp
index 3e0cf4df5..c75cd1414 100644
--- a/src/core/timing/TimingNode.cpp
+++ b/src/core/timing/TimingNode.cpp
@@ -29,6 +29,7 @@ namespace timing {
 
 // Explicit instantiation
 template struct TimingNode<WcPolicy>;
+template struct TimingNode<DeviceSynchronizePolicy>;
 template struct TimingNode<CpuPolicy>;
 
 } // namespace timing
diff --git a/src/core/timing/TimingNode.h b/src/core/timing/TimingNode.h
index 5b9c29aa2..0b6326e71 100644
--- a/src/core/timing/TimingNode.h
+++ b/src/core/timing/TimingNode.h
@@ -494,6 +494,7 @@ void addRemainderNodes(timing::TimingNode<TP> &tn) {
 }
 
 using WcTimingNode = timing::TimingNode<timing::WcPolicy>;
+using DeviceSynchronizeTimingNode = timing::TimingNode<timing::DeviceSynchronizePolicy>;
 using CpuTimingNode = timing::TimingNode<timing::CpuPolicy>;
 
 }
diff --git a/src/core/timing/TimingPool.cpp b/src/core/timing/TimingPool.cpp
index 7539fffe3..28cf668f2 100644
--- a/src/core/timing/TimingPool.cpp
+++ b/src/core/timing/TimingPool.cpp
@@ -474,6 +474,7 @@ void TimingPool<TP>::clear ()
 
 // Explicit instantiation
 template class TimingPool<WcPolicy>;
+template class TimingPool<DeviceSynchronizePolicy>;
 template class TimingPool<CpuPolicy>;
 
 
diff --git a/src/core/timing/TimingPool.h b/src/core/timing/TimingPool.h
index 5e41c14d7..2d5ed0996 100644
--- a/src/core/timing/TimingPool.h
+++ b/src/core/timing/TimingPool.h
@@ -249,5 +249,6 @@ namespace timing {
 
 namespace walberla {
    using WcTimingPool = timing::TimingPool<timing::WcPolicy>;
+   using DeviceSynchronizeTimingPool = timing::TimingPool<timing::DeviceSynchronizePolicy>;
    using CpuTimingPool = timing::TimingPool<timing::CpuPolicy>;
 }
diff --git a/src/core/timing/TimingTree.cpp b/src/core/timing/TimingTree.cpp
index fc891c31a..14cd47232 100644
--- a/src/core/timing/TimingTree.cpp
+++ b/src/core/timing/TimingTree.cpp
@@ -29,6 +29,7 @@ namespace timing {
 
 // Explicit instantiation
 template class TimingTree<WcPolicy>;
+template class TimingTree<DeviceSynchronizePolicy>;
 template class TimingTree<CpuPolicy>;
 
 } // namespace timing
diff --git a/src/core/timing/TimingTree.h b/src/core/timing/TimingTree.h
index 63d85d243..5cf06167e 100644
--- a/src/core/timing/TimingTree.h
+++ b/src/core/timing/TimingTree.h
@@ -259,5 +259,6 @@ TimingTree< TP > TimingTree< TP >::getCopyWithRemainder() const
 }
 
 using WcTimingTree = timing::TimingTree<timing::WcPolicy>;
+using DeviceSynchronizeTimingTree = timing::TimingTree<timing::DeviceSynchronizePolicy>;
 using CpuTimingTree = timing::TimingTree<timing::CpuPolicy>;
 }
diff --git a/src/gpu/AlignedAllocation.cpp b/src/gpu/AlignedAllocation.cpp
index 2a2bee41c..65e58c79a 100644
--- a/src/gpu/AlignedAllocation.cpp
+++ b/src/gpu/AlignedAllocation.cpp
@@ -20,6 +20,7 @@
 //======================================================================================================================
 
 #include "AlignedAllocation.h"
+#include "gpu/DeviceWrapper.h"
 #include "gpu/ErrorChecking.h"
 #include "core/debug/CheckFunctions.h"
 #include "core/debug/Debug.h"
@@ -34,6 +35,11 @@ namespace gpu
 
    void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
    {
+      WALBERLA_NON_DEVICE_SECTION()
+      {
+         WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+      }
+
       // With 0 alignment this function makes no sense
       // use normal malloc instead
       WALBERLA_ASSERT_GREATER( alignment, 0 )
@@ -50,8 +56,8 @@ namespace gpu
          return result;
       }
 
-      void *pa;  // pointer to allocated memory
-      void *ptr; // pointer to usable aligned memory
+      void *pa = nullptr;   // pointer to allocated memory
+      void *ptr = nullptr;  // pointer to usable aligned memory
 
       WALBERLA_GPU_CHECK( gpuMalloc( &pa, size + alignment ));
       WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "GPU malloc did not return memory with requested alignment");
@@ -65,6 +71,11 @@ namespace gpu
 
    void free_aligned_with_offset( void *ptr )
    {
+      WALBERLA_NON_DEVICE_SECTION()
+      {
+         WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+      }
+
       // assume that pointer to real allocated chunk is stored just before
       // chunk that was given to user
       WALBERLA_GPU_CHECK( gpuFree( freePointers_[ptr] ));
diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt
index a8e58ab49..fb6810d4e 100644
--- a/src/gpu/CMakeLists.txt
+++ b/src/gpu/CMakeLists.txt
@@ -20,6 +20,7 @@ target_sources( gpu
       AddGPUFieldToStorage.impl.h
       GPUField.h
       GPUWrapper.h
+      DeviceWrapper.h
       FieldAccessor3D.h
       DeviceSelectMPI.h
       HostFieldAllocator.h
diff --git a/src/gpu/DeviceSelectMPI.cpp b/src/gpu/DeviceSelectMPI.cpp
index 52454653b..81b87b3de 100644
--- a/src/gpu/DeviceSelectMPI.cpp
+++ b/src/gpu/DeviceSelectMPI.cpp
@@ -38,38 +38,41 @@ void selectDeviceBasedOnMpiRank() {
 
 void selectDeviceBasedOnMpiRank()
 {
-#ifdef WALBERLA_BUILD_WITH_MPI
-   int deviceCount;
-   WALBERLA_GPU_CHECK( gpuGetDeviceCount( &deviceCount ))
-   WALBERLA_LOG_INFO_ON_ROOT( "Selecting device depending on MPI Rank" )
 
-   MPI_Info info;
-   MPI_Info_create( &info );
-   MPI_Comm newCommunicator;
-   MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );
+   WALBERLA_DEVICE_SECTION()
+   {
+      WALBERLA_MPI_SECTION()
+      {
+         int deviceCount;
+         WALBERLA_GPU_CHECK(gpuGetDeviceCount(&deviceCount))
+         WALBERLA_LOG_INFO_ON_ROOT("Selecting device depending on MPI Rank")
 
-   int processesOnNode;
-   int rankOnNode;
-   MPI_Comm_size( newCommunicator, &processesOnNode );
-   MPI_Comm_rank( newCommunicator, &rankOnNode );
+         MPI_Info info;
+         MPI_Info_create(&info);
+         MPI_Comm newCommunicator;
+         MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator);
 
-   if ( deviceCount == processesOnNode )
-   {
-      WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode ))
-   }
-   else if ( deviceCount > processesOnNode )
-   {
-      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: "
-                               << processesOnNode << ", available GPUs on node: " << deviceCount )
-      WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode ))
-   }
-   else
-   {
-      WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node "
-                               << processesOnNode << ", available GPUs on node " << deviceCount )
-      WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode % deviceCount ))
+         int processesOnNode;
+         int rankOnNode;
+         MPI_Comm_size(newCommunicator, &processesOnNode);
+         MPI_Comm_rank(newCommunicator, &rankOnNode);
+
+         if (deviceCount == processesOnNode) { WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode)) }
+         else if (deviceCount > processesOnNode)
+         {
+            WALBERLA_LOG_WARNING("Not using all available GPUs on node. Processes on node: "
+                                 << processesOnNode << ", available GPUs on node: " << deviceCount)
+            WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode))
+         }
+         else
+         {
+            WALBERLA_LOG_WARNING(
+               "Too many processes started per node - should be one per GPU. Number of processes per node "
+               << processesOnNode << ", available GPUs on node " << deviceCount)
+            WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode % deviceCount))
+         }
+      }
    }
-#endif
 }
 
 #endif
diff --git a/src/gpu/DeviceSelectMPI.h b/src/gpu/DeviceSelectMPI.h
index 34d763f93..5ed18edf5 100644
--- a/src/gpu/DeviceSelectMPI.h
+++ b/src/gpu/DeviceSelectMPI.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "gpu/DeviceWrapper.h"
 
 namespace walberla {
 namespace gpu
diff --git a/src/gpu/DeviceWrapper.h b/src/gpu/DeviceWrapper.h
new file mode 100644
index 000000000..64590bd4a
--- /dev/null
+++ b/src/gpu/DeviceWrapper.h
@@ -0,0 +1,292 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DeviceWrapper.h
+//! \ingroup gpu
+//! \author Richard Angersbach <richard.angersbach@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+/// \cond internal
+
+#include <sstream>
+#include "core/Abort.h"
+
+// CMake generated header
+#include "waLBerlaDefinitions.h"
+
+// DEVICE SECTION //
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+
+#define WALBERLA_DEVICE_SECTION() if (true)
+#define WALBERLA_NON_DEVICE_SECTION() if (false)
+
+#else
+
+#define WALBERLA_DEVICE_SECTION() if (false)
+#define WALBERLA_NON_DEVICE_SECTION() if (true)
+
+#endif
+
+namespace walberla {
+namespace gpustubs {
+   // empty namespace which can be used
+} // namespace gpustubs
+} // namespace walberla
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+
+// include runtime header
+#include "gpu/GPUWrapper.h"
+
+#else // WALBERLA_BUILD_WITH_GPU_SUPPORT
+
+namespace walberla {
+namespace gpustubs {
+
+// dummy definitions for CUDA/HIP data types and functions in order to guarantee successful compilation without CUDA/HIP enabled
+
+#define WALBERLA_DEVICE_FUNCTION_ERROR \
+   WALBERLA_ABORT("Invalid device function call! In case of compiling without CUDA/HIP, functions are not " \
+                  "available and shouldn't be called!");
+
+#ifndef __CUDACC__
+   #define __device__
+   #define __global__
+   #define __host__
+   #define __forceinline__
+#endif
+
+using gpuError_t = int;
+const gpuError_t gpuSuccess = 0;
+
+#define gpuHostAllocDefault 0x00
+#define gpuHostAllocMapped 0x02
+#define gpuHostAllocPortable 0x01
+#define gpuHostAllocWriteCombined 0x04
+
+using gpuMemcpyKind                          = int;
+const gpuMemcpyKind gpuMemcpyHostToHost     = 0;
+const gpuMemcpyKind gpuMemcpyHostToDevice   = 1;
+const gpuMemcpyKind gpuMemcpyDeviceToHost   = 2;
+const gpuMemcpyKind gpuMemcpyDeviceToDevice = 3;
+const gpuMemcpyKind gpuMemcpyDefault        = 4;
+
+inline const char* gpuGetErrorName(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline const char* gpuGetErrorString(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuGetLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuPeekAtLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+inline gpuError_t gpuMalloc(void** /*devPtr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuMallocHost(void** /*ptr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuHostAlloc(void** /*pHost*/, size_t /*size*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+struct gpuPos
+{
+   size_t x, y, z;
+};
+
+struct gpuPitchedPtr
+{
+   size_t pitch;
+   void* ptr;
+   size_t xsize;
+   size_t ysize;
+};
+
+struct gpuExtent
+{
+   size_t depth;
+   size_t height;
+   size_t width;
+};
+
+struct gpuArray;
+typedef struct gpuArray* gpuArray_t;
+typedef struct gpuArray* gpuArray_const_t;
+
+struct CUstream_st;
+typedef struct CUstream_st* gpuStream_t;
+inline gpuError_t gpuStreamDestroy(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuStreamCreateWithPriority(gpuStream_t* /*pStream*/, unsigned int /*flags*/, int /*priority*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuStreamCreateWithFlags(gpuStream_t* /*pStream*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuDeviceGetStreamPriorityRange(int* /*leastPriority*/, int* /*greatestPriority*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuStreamCreate(gpuStream_t* /*pStream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuStreamSynchronize(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+struct gpuMemcpy3DParms
+{
+   gpuArray_t dstArray;
+   gpuPos dstPos;
+   gpuPitchedPtr dstPtr;
+   gpuExtent extent;
+   gpuMemcpyKind kind;
+   gpuArray_t srcArray;
+   gpuPos srcPos;
+   gpuPitchedPtr srcPtr;
+};
+
+inline gpuError_t gpuMemcpy(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuMemcpyAsync(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuMemcpy3D(const gpuMemcpy3DParms* /*p*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuMemcpy3DAsync(const gpuMemcpy3DParms* /*p*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+inline gpuPos make_gpuPos(size_t /*x*/, size_t /*y*/, size_t /*z*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuPitchedPtr make_gpuPitchedPtr (void* /*d*/, size_t /*p*/, size_t /*xsz*/, size_t /*ysz*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuExtent make_gpuExtent(size_t /*w*/, size_t /*h*/, size_t /*d*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+inline gpuError_t gpuFree(void* /*devPtr*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuFreeHost(void* /*ptr*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+inline gpuError_t gpuDeviceSynchronize(void) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+struct CUevent_st;
+typedef struct CUevent_st* gpuEvent_t;
+inline gpuError_t gpuEventCreate(gpuEvent_t* /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuEventCreateWithFlags(gpuEvent_t* /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuEventRecord(gpuEvent_t /*event*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuEventDestroy(gpuEvent_t /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuStreamWaitEvent (gpuStream_t /*stream*/, gpuEvent_t /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+#define gpuStreamDefault 0x00
+
+inline gpuError_t gpuGetDeviceCount(int* /*count*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+inline gpuError_t gpuSetDevice(int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+struct CUuuid_st
+{
+   char bytes;
+};
+typedef struct CUuuid_st gpuUUID_t;
+
+struct gpuDeviceProp
+{
+   char name[256];
+   gpuUUID_t uuid;
+   size_t totalGlobalMem;
+   size_t sharedMemPerBlock;
+   int regsPerBlock;
+   int warpSize;
+   size_t memPitch;
+   int maxThreadsPerBlock;
+   int maxThreadsDim[3];
+   int maxGridSize[3];
+   int clockRate;
+   size_t totalConstMem;
+   int major;
+   int minor;
+   size_t textureAlignment;
+   size_t texturePitchAlignment;
+   int deviceOverlap;
+   int multiProcessorCount;
+   int kernelExecTimeoutEnabled;
+   int integrated;
+   int canMapHostMemory;
+   int computeMode;
+   int maxTexture1D;
+   int maxTexture1DMipmap;
+   int maxTexture1DLinear;
+   int maxTexture2D[2];
+   int maxTexture2DMipmap[2];
+   int maxTexture2DLinear[3];
+   int maxTexture2DGather[2];
+   int maxTexture3D[3];
+   int maxTexture3DAlt[3];
+   int maxTextureCubemap;
+   int maxTexture1DLayered[2];
+   int maxTexture2DLayered[3];
+   int maxTextureCubemapLayered[2];
+   int maxSurface1D;
+   int maxSurface2D[2];
+   int maxSurface3D[3];
+   int maxSurface1DLayered[2];
+   int maxSurface2DLayered[3];
+   int maxSurfaceCubemap;
+   int maxSurfaceCubemapLayered[2];
+   size_t surfaceAlignment;
+   int concurrentKernels;
+   int ECCEnabled;
+   int pciBusID;
+   int pciDeviceID;
+   int pciDomainID;
+   int tccDriver;
+   int asyncEngineCount;
+   int unifiedAddressing;
+   int memoryClockRate;
+   int memoryBusWidth;
+   int l2CacheSize;
+   int persistingL2CacheMaxSize;
+   int maxThreadsPerMultiProcessor;
+   int streamPrioritiesSupported;
+   int globalL1CacheSupported;
+   int localL1CacheSupported;
+   size_t sharedMemPerMultiprocessor;
+   int regsPerMultiprocessor;
+   int managedMemory;
+   int isMultiGpuBoard;
+   int multiGpuBoardGroupID;
+   int singleToDoublePrecisionPerfRatio;
+   int pageableMemoryAccess;
+   int concurrentManagedAccess;
+   int computePreemptionSupported;
+   int canUseHostPointerForRegisteredMem;
+   int cooperativeLaunch;
+   int cooperativeMultiDeviceLaunch;
+   int pageableMemoryAccessUsesHostPageTables;
+   int directManagedMemAccessFromHost;
+   int accessPolicyMaxWindowSize;
+};
+inline gpuError_t gpuGetDeviceProperties(gpuDeviceProp* /*prop*/, int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+struct uint3
+{
+   unsigned int x, y, z;
+};
+typedef struct uint3 uint3;
+
+struct dim3
+{
+   unsigned int x, y, z;
+   dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+   dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+   operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
+};
+typedef struct dim3 dim3;
+
+inline gpuError_t gpuLaunchKernel(const void* /*func*/, dim3 /*gridDim*/, dim3 /*blockDim*/, void** /*args*/, size_t /*sharedMem*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+#ifdef _WIN32
+#define GPURT_CB __stdcall
+#else
+#define GPURT_CB
+#endif
+
+typedef void(GPURT_CB* gpuHostFn_t)(void* /*userData*/);
+inline gpuError_t gpuLaunchHostFunc(gpuStream_t /*stream*/, gpuHostFn_t /*fn*/, void* /*userData*/) { WALBERLA_DEVICE_FUNCTION_ERROR }
+
+#undef WALBERLA_DEVICE_FUNCTION_ERROR
+
+} // namespace gpustubs
+using namespace gpustubs;
+
+} // namespace walberla
+
+
+#endif // WALBERLA_BUILD_WITH_GPU_SUPPORT
+
+/// \endcond
diff --git a/src/gpu/ErrorChecking.h b/src/gpu/ErrorChecking.h
index a80ef0334..7031a936f 100644
--- a/src/gpu/ErrorChecking.h
+++ b/src/gpu/ErrorChecking.h
@@ -25,14 +25,14 @@
 
 #include <sstream>
 
-#include "gpu/GPUWrapper.h"
+#include "gpu/DeviceWrapper.h"
 
 namespace walberla {
 namespace gpu {
 
 
 #define WALBERLA_GPU_CHECK(ans) { ::walberla::gpu::checkForError((ans), __FILE__, __LINE__); }
-#define WALBERLA_GPU_CHECK_LAST_ERROR() {::walberla::gpu::checkForLastError(__FILE__, __LINE__);}
+#define WALBERLA_GPU_CHECK_LAST_ERROR() { ::walberla::gpu::checkForLastError(__FILE__, __LINE__); }
 
 
 
@@ -56,7 +56,7 @@ inline void checkForLastError( const std::string & callerPath, const int line )
    if(code != gpuSuccess)
    {
       std::stringstream ss;
-      ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code );
+      ss << "CUDA Error: " << code << " " << gpuGetErrorName(code) << ": " << gpuGetErrorString( code );
       Abort::instance()->abort( ss.str(), callerPath, line );
    }
 }
diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h
index 798440d12..cd50cc58d 100644
--- a/src/gpu/FieldAccessor.h
+++ b/src/gpu/FieldAccessor.h
@@ -22,8 +22,8 @@
 #pragma once
 
 #include "core/DataTypes.h"
-
 #include "gpu/GPUWrapper.h"
+#include "gpu/DeviceWrapper.h"
 
 namespace walberla {
 namespace gpu
diff --git a/src/gpu/FieldCopy.h b/src/gpu/FieldCopy.h
index 13c079074..6895661ec 100644
--- a/src/gpu/FieldCopy.h
+++ b/src/gpu/FieldCopy.h
@@ -98,56 +98,48 @@ namespace gpu
    template<typename T, uint_t fs>
    void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src )
    {
-      gpuMemcpy3DParms p;
-      memset( &p, 0, sizeof(p) );
-
-
-      if ( dst.layout() != src.layout() ) {
-         WALBERLA_ABORT( "Cannot copy fields with different layout" )
-      }
-
-      bool canCopy = ( src.layout()     == fzyx &&
-                       dst.fAllocSize() == src.fAllocSize() &&
-                       dst.zAllocSize() == src.zAllocSize() &&
-                       dst.yAllocSize() == src.yAllocSize() &&
-                       dst.xSize()      == src.xSize() )
-                      ||
-                      ( src.layout()     == zyxf &&
-                        dst.zAllocSize() == src.zAllocSize() &&
-                        dst.yAllocSize() == src.yAllocSize() &&
-                        dst.xAllocSize() == src.xAllocSize() &&
-                        dst.fSize()      == src.fSize() );
-
-      if ( !canCopy ) {
-         WALBERLA_ABORT("Field have to have the same size ")
-      }
-
-      if ( dst.layout() == fzyx )
-      {
-         p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()),          // pointer
-                                         sizeof(T) * src.xAllocSize(), // pitch
-                                         src.xAllocSize(),             // inner dimension size
-                                         src.yAllocSize()  );          // next outer dimension size
-
-         p.extent.width  = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
-         p.extent.height = dst.yAllocSize();
-         p.extent.depth  = dst.zAllocSize() * dst.fAllocSize();
-      }
-      else
+      WALBERLA_DEVICE_SECTION()
       {
-         p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()),          // pointer
-                                         sizeof(T) * src.fAllocSize(), // pitch
-                                         src.fAllocSize(),             // inner dimension size
-                                         src.xAllocSize()  );          // next outer dimension size
-
-         p.extent.width  = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
-         p.extent.height = dst.xAllocSize();
-         p.extent.depth  = dst.yAllocSize() * dst.zAllocSize();
+         gpuMemcpy3DParms p;
+         memset(&p, 0, sizeof(p));
+
+         if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") }
+
+         bool canCopy =
+            (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() &&
+             dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) ||
+            (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() &&
+             dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize());
+
+         if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") }
+
+         if (dst.layout() == fzyx)
+         {
+            p.srcPtr = make_gpuPitchedPtr((void*) (src.data()),         // pointer
+                                          sizeof(T) * src.xAllocSize(), // pitch
+                                          src.xAllocSize(),             // inner dimension size
+                                          src.yAllocSize());            // next outer dimension size
+
+            p.extent.width  = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T);
+            p.extent.height = dst.yAllocSize();
+            p.extent.depth  = dst.zAllocSize() * dst.fAllocSize();
+         }
+         else
+         {
+            p.srcPtr = make_gpuPitchedPtr((void*) (src.data()),         // pointer
+                                          sizeof(T) * src.fAllocSize(), // pitch
+                                          src.fAllocSize(),             // inner dimension size
+                                          src.xAllocSize());            // next outer dimension size
+
+            p.extent.width  = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T);
+            p.extent.height = dst.xAllocSize();
+            p.extent.depth  = dst.yAllocSize() * dst.zAllocSize();
+         }
+
+         p.dstPtr = dst.pitchedPtr();
+         p.kind   = gpuMemcpyHostToDevice;
+         WALBERLA_GPU_CHECK(gpuMemcpy3D(&p))
       }
-
-      p.dstPtr = dst.pitchedPtr();
-      p.kind = gpuMemcpyHostToDevice;
-      WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) )
    }
 
 
@@ -155,56 +147,48 @@ namespace gpu
    template<typename T, uint_t fs>
    void fieldCpy( field::Field<T,fs> & dst, const gpu::GPUField<T> & src )
    {
-      gpuMemcpy3DParms p;
-      memset( &p, 0, sizeof(p) );
-
-      if ( dst.layout() != src.layout() ) {
-         WALBERLA_ABORT( "Cannot copy fields with different layout" )
-      }
-
-      bool canCopy = ( src.layout()     == fzyx &&
-                       dst.fAllocSize() == src.fAllocSize() &&
-                       dst.zAllocSize() == src.zAllocSize() &&
-                       dst.yAllocSize() == src.yAllocSize() &&
-                       dst.xSize()      == src.xSize() )
-                      ||
-                      ( src.layout()     == zyxf &&
-                        dst.zAllocSize() == src.zAllocSize() &&
-                        dst.yAllocSize() == src.yAllocSize() &&
-                        dst.xAllocSize() == src.xAllocSize() &&
-                        dst.fSize()      == src.fSize() );
-
-      if ( !canCopy ) {
-         WALBERLA_ABORT("Field have to have the same size ")
-      }
-
-      if ( dst.layout() == fzyx )
+      WALBERLA_DEVICE_SECTION()
       {
-         p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()),          // pointer
-                                         sizeof(T) * dst.xAllocSize(), // pitch
-                                         dst.xAllocSize(),             // inner dimension size
-                                         dst.yAllocSize()  );          // next outer dimension size
-
-         p.extent.width  = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
-         p.extent.height = dst.yAllocSize();
-         p.extent.depth  = dst.zAllocSize() * dst.fAllocSize();
+         gpuMemcpy3DParms p;
+         memset(&p, 0, sizeof(p));
+
+         if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") }
+
+         bool canCopy =
+            (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() &&
+             dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) ||
+            (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() &&
+             dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize());
+
+         if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") }
+
+         if (dst.layout() == fzyx)
+         {
+            p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()),         // pointer
+                                          sizeof(T) * dst.xAllocSize(), // pitch
+                                          dst.xAllocSize(),             // inner dimension size
+                                          dst.yAllocSize());            // next outer dimension size
+
+            p.extent.width  = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T);
+            p.extent.height = dst.yAllocSize();
+            p.extent.depth  = dst.zAllocSize() * dst.fAllocSize();
+         }
+         else
+         {
+            p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()),         // pointer
+                                          sizeof(T) * dst.fAllocSize(), // pitch
+                                          dst.fAllocSize(),             // inner dimension size
+                                          dst.xAllocSize());            // next outer dimension size
+
+            p.extent.width  = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T);
+            p.extent.height = dst.xAllocSize();
+            p.extent.depth  = dst.yAllocSize() * dst.zAllocSize();
+         }
+
+         p.srcPtr = src.pitchedPtr();
+         p.kind   = gpuMemcpyDeviceToHost;
+         WALBERLA_GPU_CHECK(gpuMemcpy3D(&p))
       }
-      else
-      {
-         p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()),          // pointer
-                                         sizeof(T) * dst.fAllocSize(), // pitch
-                                         dst.fAllocSize(),             // inner dimension size
-                                         dst.xAllocSize()  );          // next outer dimension size
-
-         p.extent.width  = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
-         p.extent.height = dst.xAllocSize();
-         p.extent.depth  = dst.yAllocSize() * dst.zAllocSize();
-      }
-
-      p.srcPtr = src.pitchedPtr();
-      p.kind = gpuMemcpyDeviceToHost;
-      WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) )
-
    }
 
 } // namespace gpu
diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h
index c11953e4f..51b337e61 100644
--- a/src/gpu/FieldIndexing.h
+++ b/src/gpu/FieldIndexing.h
@@ -23,6 +23,7 @@
 #pragma once
 
 #include "stencil/Directions.h"
+#include "gpu/DeviceWrapper.h"
 
 #include "FieldAccessor.h"
 
diff --git a/src/gpu/FieldIndexing.impl.h b/src/gpu/FieldIndexing.impl.h
index 922a48b9b..a8c9feccf 100644
--- a/src/gpu/FieldIndexing.impl.h
+++ b/src/gpu/FieldIndexing.impl.h
@@ -44,17 +44,21 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
 {
    WALBERLA_DEBUG_SECTION()
    {
-      gpuDeviceProp prop;
-      int count;
-      gpuGetDeviceCount(&count);
-      int threadsPerBlock = std::numeric_limits<int>::max();
-      for (int i = 0; i < count; i++) {
-         gpuGetDeviceProperties(&prop, i);
-         threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
+      WALBERLA_DEVICE_SECTION()
+      {
+         gpuDeviceProp prop;
+         int count;
+         gpuGetDeviceCount(&count);
+         int threadsPerBlock = std::numeric_limits< int >::max();
+         for (int i = 0; i < count; i++)
+         {
+            gpuGetDeviceProperties(&prop, i);
+            threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock);
+         }
+         WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock,
+                              "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller "
+                                 << "than the maximal thread count per GPU block.")
       }
-      WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock,
-                            "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
-                            "than the maximal thread count per GPU block." )
    }
 }
 
diff --git a/src/gpu/FieldIndexing3D.impl.h b/src/gpu/FieldIndexing3D.impl.h
index a8cc922cf..5aa027872 100644
--- a/src/gpu/FieldIndexing3D.impl.h
+++ b/src/gpu/FieldIndexing3D.impl.h
@@ -52,17 +52,21 @@ FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field,
 {
    WALBERLA_DEBUG_SECTION()
    {
-      gpuDeviceProp prop;
-      int count;
-      gpuGetDeviceCount(&count);
-      int threadsPerBlock = std::numeric_limits<int>::max();
-      for (int i = 0; i < count; i++) {
-         gpuGetDeviceProperties(&prop, i);
-         threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
+      WALBERLA_DEVICE_SECTION()
+      {
+         gpuDeviceProp prop;
+         int count;
+         gpuGetDeviceCount(&count);
+         int threadsPerBlock = std::numeric_limits< int >::max();
+         for (int i = 0; i < count; i++)
+         {
+            gpuGetDeviceProperties(&prop, i);
+            threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock);
+         }
+         WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock,
+                              "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller "
+                                 << "than the maximal thread count per GPU block.")
       }
-      WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock,
-                            "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
-                            "than the maximal thread count per GPU block." )
    }
 }
 
diff --git a/src/gpu/FieldIndexingXYZ.h b/src/gpu/FieldIndexingXYZ.h
index b6da50d5d..f62161bd1 100644
--- a/src/gpu/FieldIndexingXYZ.h
+++ b/src/gpu/FieldIndexingXYZ.h
@@ -21,6 +21,9 @@
 
 #pragma once
 
+#include "core/DataTypes.h"
+
+#include "DeviceWrapper.h"
 #include "FieldAccessorXYZ.h"
 
 namespace walberla { namespace cell {  class CellInterval;  } }
diff --git a/src/gpu/FieldIndexingXYZ.impl.h b/src/gpu/FieldIndexingXYZ.impl.h
index 9ec8b6c08..d75560b9d 100644
--- a/src/gpu/FieldIndexingXYZ.impl.h
+++ b/src/gpu/FieldIndexingXYZ.impl.h
@@ -41,17 +41,21 @@ FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field,
 {
    WALBERLA_DEBUG_SECTION()
    {
-      gpuDeviceProp prop;
-      int count;
-      gpuGetDeviceCount(&count);
-      int threadsPerBlock = std::numeric_limits<int>::max();
-      for (int i = 0; i < count; i++) {
-         gpuGetDeviceProperties(&prop, i);
-         threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
+      WALBERLA_DEVICE_SECTION()
+      {
+         gpuDeviceProp prop;
+         int count;
+         gpuGetDeviceCount(&count);
+         int threadsPerBlock = std::numeric_limits< int >::max();
+         for (int i = 0; i < count; i++)
+         {
+            gpuGetDeviceProperties(&prop, i);
+            threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock);
+         }
+         WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock,
+                              "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller "
+                                 << "than the maximal thread count per GPU block.")
       }
-      WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock,
-                            "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
-                            "than the maximal thread count per GPU block." )
    }
 }
 
diff --git a/src/gpu/GPUCopy.h b/src/gpu/GPUCopy.h
index e74d04ddb..08e94a38a 100644
--- a/src/gpu/GPUCopy.h
+++ b/src/gpu/GPUCopy.h
@@ -25,7 +25,8 @@
 
 #include "core/DataTypes.h"
 
-#include "ErrorChecking.h"
+#include "gpu/ErrorChecking.h"
+#include "gpu/DeviceWrapper.h"
 
 #include <tuple>
 
diff --git a/src/gpu/GPUField.h b/src/gpu/GPUField.h
index ecc9ccc5b..f8a0242ed 100755
--- a/src/gpu/GPUField.h
+++ b/src/gpu/GPUField.h
@@ -28,7 +28,7 @@
 
 #include "stencil/Directions.h"
 
-#include "gpu/GPUWrapper.h"
+#include "gpu/DeviceWrapper.h"
 
 namespace walberla {
 namespace gpu
@@ -40,7 +40,7 @@ namespace gpu
 
 
    //*******************************************************************************************************************
-   /*! GhostLayerField stored on a CUDA GPU
+   /*! GhostLayerField stored on a CUDA/HIP GPU
    *
    *  Basically a wrapper around a CUDA/HIP device pointer together with size information about the field
    *  i.e. sizes in x,y,z,f directions and number of ghost layers.
@@ -155,7 +155,7 @@ namespace gpu
       //****************************************************************************************************************
 
    protected:
-      gpuPitchedPtr pitchedPtr_;
+      gpuPitchedPtr  pitchedPtr_;
       uint_t         nrOfGhostLayers_;
       uint_t         xSize_;
       uint_t         ySize_;
diff --git a/src/gpu/GPUField.impl.h b/src/gpu/GPUField.impl.h
index 9c1242aa9..dd42d088c 100644
--- a/src/gpu/GPUField.impl.h
+++ b/src/gpu/GPUField.impl.h
@@ -22,6 +22,7 @@
 #include "GPUField.h"
 #include "ErrorChecking.h"
 #include "AlignedAllocation.h"
+#include "DeviceWrapper.h"
 #include "core/logging/Logging.h"
 
 namespace walberla {
@@ -36,40 +37,44 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
      xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ),
      layout_( _layout ), usePitchedMem_( usePitchedMem ), timestepCounter_(0)
 {
+   WALBERLA_NON_DEVICE_SECTION() {
+      WALBERLA_ABORT(__FUNCTION__ << "Instantiating GPU field without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+   }
+
    gpuExtent extent;
-   if ( layout_ == zyxf )
+   if (layout_ == zyxf)
    {
       extent.width  = _fSize * sizeof(T);
-      extent.height = (_xSize + 2 * _nrOfGhostLayers );
-      extent.depth  = (_ySize + 2 * _nrOfGhostLayers ) * ( _zSize + 2 * _nrOfGhostLayers );
+      extent.height = (_xSize + 2 * _nrOfGhostLayers);
+      extent.depth  = (_ySize + 2 * _nrOfGhostLayers) * (_zSize + 2 * _nrOfGhostLayers);
    }
    else
    {
-      extent.width  = (_xSize + 2 * _nrOfGhostLayers ) * sizeof(T);
-      extent.height = (_ySize + 2 * _nrOfGhostLayers );
-      extent.depth  = (_zSize + 2 * _nrOfGhostLayers ) * _fSize;
+      extent.width  = (_xSize + 2 * _nrOfGhostLayers) * sizeof(T);
+      extent.height = (_ySize + 2 * _nrOfGhostLayers);
+      extent.depth  = (_zSize + 2 * _nrOfGhostLayers) * _fSize;
    }
 
-   if ( usePitchedMem_ )
+   if (usePitchedMem_)
    {
       size_t pitch;
       const size_t alignment = 256;
-      void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment,
-                                                 sizeof(T) * nrOfGhostLayers_ );
-      WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 )
-      pitchedPtr_ = make_gpuPitchedPtr( mem, pitch, extent.width, extent.height );
+      void* mem = allocate_pitched_with_offset(pitch, extent.width, extent.height * extent.depth, alignment,
+                                               sizeof(T) * nrOfGhostLayers_);
+      WALBERLA_ASSERT_EQUAL(size_t((char*) (mem) + sizeof(T) * nrOfGhostLayers_) % alignment, 0)
+      pitchedPtr_ = make_gpuPitchedPtr(mem, pitch, extent.width, extent.height);
    }
    else
    {
-      pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height );
-      WALBERLA_GPU_CHECK ( gpuMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) )
+      pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height);
+      WALBERLA_GPU_CHECK(gpuMalloc(&pitchedPtr_.ptr, extent.width * extent.height * extent.depth))
    }
 
    // allocation size is stored in pitched pointer
    // pitched pointer stores the amount of padded region in bytes
    // but we keep track of the size in #elements
-   WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 )
-   if ( layout_ == field::fzyx )
+   WALBERLA_ASSERT_EQUAL(pitchedPtr_.pitch % sizeof(T), 0)
+   if (layout_ == field::fzyx)
    {
       xAllocSize_ = pitchedPtr_.pitch / sizeof(T);
       fAllocSize_ = fSize_;
diff --git a/src/gpu/HostFieldAllocator.h b/src/gpu/HostFieldAllocator.h
index 2b7311add..8b24c3a47 100644
--- a/src/gpu/HostFieldAllocator.h
+++ b/src/gpu/HostFieldAllocator.h
@@ -22,7 +22,8 @@
 
 #pragma once
 
-#include "ErrorChecking.h"
+#include "gpu/ErrorChecking.h"
+#include "gpu/DeviceWrapper.h"
 #include "field/allocation/FieldAllocator.h"
 
 
@@ -51,22 +52,37 @@ namespace gpu
       virtual T * allocateMemory (  uint_t size0, uint_t size1, uint_t size2, uint_t size3,
                                     uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 )
       {
+         WALBERLA_NON_DEVICE_SECTION()
+         {
+            WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+         }
+
          allocSize1=size1;
          allocSize2=size2;
          allocSize3=size3;
-         void * result;
-         WALBERLA_GPU_CHECK( gpuHostAlloc( &result, size0*size1*size2*size3*sizeof(T), HostAllocFlags ) )
+         void * result = nullptr;
+         WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size0 * size1 * size2 * size3 * sizeof(T), HostAllocFlags))
          return (T*)(result);
       }
 
       virtual T * allocateMemory ( uint_t size )
       {
-         T* result;
-         gpuHostAlloc( &result, size*sizeof(T), HostAllocFlags );
-         return result;
+         WALBERLA_NON_DEVICE_SECTION()
+         {
+            WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+         }
+
+         void * result = nullptr;
+         WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size*sizeof(T), HostAllocFlags))
+         return (T*)(result);
       }
 
-      virtual void deallocate(T *& values) {WALBERLA_GPU_CHECK( gpuFreeHost( values ) )}
+      virtual void deallocate(T *& values) {
+         WALBERLA_NON_DEVICE_SECTION() {
+            WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.")
+         }
+         WALBERLA_GPU_CHECK(gpuFreeHost(values))
+      }
    };
 
 
diff --git a/src/gpu/Kernel.h b/src/gpu/Kernel.h
index 6fca21052..f6c2eb687 100644
--- a/src/gpu/Kernel.h
+++ b/src/gpu/Kernel.h
@@ -236,7 +236,10 @@ namespace gpu
       // .. and launch the kernel
       static_assert( sizeof(void *) == sizeof(void (*)(void)),
                      "object pointer and function pointer sizes must be equal" );
-      WALBERLA_GPU_CHECK( gpuLaunchKernel( (void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream ) )
+      WALBERLA_DEVICE_SECTION()
+      {
+         WALBERLA_GPU_CHECK(gpuLaunchKernel((void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream))
+      }
    }
 
 
diff --git a/src/gpu/NVTX.h b/src/gpu/NVTX.h
index 46302f917..86e3e6a3a 100644
--- a/src/gpu/NVTX.h
+++ b/src/gpu/NVTX.h
@@ -21,11 +21,14 @@
 
 #include "core/DataTypes.h"
 
-#include <string>
+#include "DeviceWrapper.h"
 
-#include <nvToolsExt.h>
-#include <nvToolsExtCuda.h>
-#include <nvToolsExtCudaRt.h>
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+  #include <nvToolsExt.h>
+  #include <nvToolsExtCuda.h>
+  #include <nvToolsExtCudaRt.h>
+  #include <string>
+#endif
 
 namespace walberla{
 namespace gpu
@@ -33,20 +36,30 @@ namespace gpu
 
 inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa)
 {
-    nvtxEventAttributes_t eventAttrib;
-    memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
-    eventAttrib.version = NVTX_VERSION;
-    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-    eventAttrib.colorType = NVTX_COLOR_ARGB;
-    eventAttrib.color = 0xFF000000 | color;
-    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
-    eventAttrib.message.ascii = name.c_str();
-    nvtxMarkEx(&eventAttrib);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   nvtxEventAttributes_t eventAttrib;
+   memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
+   eventAttrib.version       = NVTX_VERSION;
+   eventAttrib.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+   eventAttrib.colorType     = NVTX_COLOR_ARGB;
+   eventAttrib.color         = 0xFF000000 | color;
+   eventAttrib.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+   eventAttrib.message.ascii = name.c_str();
+   nvtxMarkEx(&eventAttrib);
+#else
+    WALBERLA_UNUSED(name);
+    WALBERLA_UNUSED(color);
+#endif
 }
 
 inline void nameStream(const cudaStream_t & stream, const std::string & name)
 {
-    nvtxNameCudaStreamA(stream, name.c_str());
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   nvtxNameCudaStreamA(stream, name.c_str());
+#else
+   WALBERLA_UNUSED(stream);
+   WALBERLA_UNUSED(name);
+#endif
 }
 
 class NvtxRange
@@ -54,21 +67,31 @@ class NvtxRange
 public:
     NvtxRange(const std::string & name, const uint32_t color=0xaaaaaa)
     {
-        memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
-        eventAttrib.version = NVTX_VERSION;
-        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-        eventAttrib.colorType = NVTX_COLOR_ARGB;
-        eventAttrib.color = 0xFF000000 | color;
-        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
-        eventAttrib.message.ascii = name.c_str();
-        nvtxRangePushEx(&eventAttrib);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
+      eventAttrib.version       = NVTX_VERSION;
+      eventAttrib.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+      eventAttrib.colorType     = NVTX_COLOR_ARGB;
+      eventAttrib.color         = 0xFF000000 | color;
+      eventAttrib.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+      eventAttrib.message.ascii = name.c_str();
+      nvtxRangePushEx(&eventAttrib);
+#else
+      WALBERLA_UNUSED(name);
+      WALBERLA_UNUSED(color);
+#endif
     }
+
     ~NvtxRange()
     {
-        nvtxRangePop();
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+         nvtxRangePop();
+#endif
     }
 private:
+#if defined(WALBERLA_BUILD_WITH_CUDA)
     nvtxEventAttributes_t eventAttrib;
+#endif
 };
 
 
diff --git a/src/gpu/ParallelStreams.cpp b/src/gpu/ParallelStreams.cpp
index 2dffc7f0a..aed66f693 100644
--- a/src/gpu/ParallelStreams.cpp
+++ b/src/gpu/ParallelStreams.cpp
@@ -21,6 +21,7 @@
 
 
 #include "gpu/ParallelStreams.h"
+#include "gpu/DeviceWrapper.h"
 
 namespace walberla {
 namespace gpu
@@ -30,32 +31,40 @@ namespace gpu
    ParallelSection::ParallelSection(ParallelStreams * parent, gpuStream_t mainStream)
      : parent_( parent ), mainStream_( mainStream ), counter_( 0 )
    {
-      WALBERLA_GPU_CHECK( gpuEventCreate(&startEvent_) )
-      WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) )
+      WALBERLA_DEVICE_SECTION()
+      {
+         WALBERLA_GPU_CHECK(gpuEventCreate(&startEvent_))
+         WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_))
+      }
    }
 
    ParallelSection::~ParallelSection()
    {
-      synchronize();
-      WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) )
+      WALBERLA_DEVICE_SECTION()
+      {
+         synchronize();
+         WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) )
+      }
    }
 
    void ParallelSection::next()
    {
-      if( counter_ > 0 ) {
-         WALBERLA_GPU_CHECK( gpuEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) )
-      }
-      else {
-         WALBERLA_GPU_CHECK( gpuEventRecord( parent_->mainEvent_, mainStream_ ) )
-      }
-      ++counter_;
+      WALBERLA_DEVICE_SECTION()
+      {
+         if (counter_ > 0)
+         {
+            WALBERLA_GPU_CHECK(gpuEventRecord(parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1]))
+         }
+         else { WALBERLA_GPU_CHECK(gpuEventRecord(parent_->mainEvent_, mainStream_)) }
+         ++counter_;
 
-      parent_->ensureSize( counter_ );
+         parent_->ensureSize(counter_);
 
-      WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream(), startEvent_, 0 ))
+         WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream(), startEvent_, 0))
+      }
    }
 
-   void ParallelSection::run(const std::function<void( gpuStream_t)> & f)
+   void ParallelSection::run(const std::function<void(gpuStream_t)> & f)
    {
       f( stream() );
       next();
@@ -63,18 +72,20 @@ namespace gpu
 
    void ParallelSection::synchronize()
    {
-      for( uint_t i=0; i < counter_; ++i )
-         for( uint_t j=0; j < counter_; ++j )
-         {
-            if( i == j )
-               continue;
+      WALBERLA_DEVICE_SECTION()
+      {
+         for (uint_t i = 0; i < counter_; ++i)
+            for (uint_t j = 0; j < counter_; ++j)
+            {
+               if (i == j) continue;
 
-            auto & event  = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1];
-            gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1];
-            WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream, event, 0 ))
-         }
+               auto& event        = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1];
+               gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1];
+               WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream, event, 0))
+            }
 
-      WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) )
+         WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_))
+      }
    }
 
    gpuStream_t ParallelSection::stream()
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 28033d146..8a8616c1e 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -321,7 +321,7 @@ namespace communication {
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(cudaStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream)
    {
       return [this, stream]() { wait( stream ); };
    }
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
index 4a082d341..8d95855e6 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h
@@ -80,7 +80,7 @@ class BasicRecursiveTimeStepGPU
    ~BasicRecursiveTimeStepGPU() = default;
 
    void operator()() { timestep(0); };
-   void addRefinementToTimeLoop(timeloop::SweepTimeloop& timeloop, uint_t level = 0);
+   void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level = 0);
    void test(uint_t maxLevel, uint_t level = 0);
 
  private:
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
index f7c5b2878..5bb43c3c8 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
@@ -94,7 +94,7 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio
 
 
 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level)
+void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level)
 {
    // 1.1 Collision
    timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level));
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
index 6b0a2a7ec..7c6fdc828 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
@@ -71,7 +71,7 @@ class BasicRecursiveTimeStep
      };
 
    void operator() () { timestep(0); };
-   void addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level=0);
+   void addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level=0);
 
  private:
    void timestep(uint_t level);
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
index 7e6d9b594..29f7de765 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
@@ -90,7 +90,7 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T
 
 
 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level)
+void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level)
 {
    // 1.1 Collision
    timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level));
diff --git a/src/timeloop/CMakeLists.txt b/src/timeloop/CMakeLists.txt
index 9035c3d9a..ba2ef178c 100644
--- a/src/timeloop/CMakeLists.txt
+++ b/src/timeloop/CMakeLists.txt
@@ -14,8 +14,6 @@ target_sources( timeloop
       SweepTimeloop.h
       Timeloop.h
       PerformanceMeter.cpp
-      SweepTimeloop.cpp
-      Timeloop.cpp
 )
 
 ###################################################################################################
diff --git a/src/timeloop/SelectableFunctionCreators.h b/src/timeloop/SelectableFunctionCreators.h
index b014f44de..b877b293e 100644
--- a/src/timeloop/SelectableFunctionCreators.h
+++ b/src/timeloop/SelectableFunctionCreators.h
@@ -186,7 +186,7 @@ namespace timeloop {
 
 
    private:
-      friend class SweepTimeloop;
+      template < typename TimingPolicy > friend class SweepTimeloop;
 
       BlockStorage & bs_;
 
diff --git a/src/timeloop/SweepTimeloop.h b/src/timeloop/SweepTimeloop.h
index 4ffc8df9e..19e9344a6 100644
--- a/src/timeloop/SweepTimeloop.h
+++ b/src/timeloop/SweepTimeloop.h
@@ -112,7 +112,8 @@ namespace timeloop {
     * \ingroup timeloop
     */
    //*******************************************************************************************************************
-   class SweepTimeloop : public Timeloop
+   template < typename TP = timing::WcPolicy>
+   class SweepTimeloop : public Timeloop<TP>
    {
    public:
 
@@ -121,11 +122,11 @@ namespace timeloop {
       //@{
 
       SweepTimeloop( BlockStorage & blockStorage, uint_t nrOfTimeSteps )
-         : Timeloop(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true)
+         : Timeloop<TP>(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true)
       {}
 
       SweepTimeloop( const shared_ptr<StructuredBlockStorage> & structuredBlockStorage, uint_t nrOfTimeSteps )
-         : Timeloop(nrOfTimeSteps), blockStorage_( structuredBlockStorage->getBlockStorage() ),
+         : Timeloop<TP>(nrOfTimeSteps), blockStorage_( structuredBlockStorage->getBlockStorage() ),
            nextId_(0), firstRun_(true)
       {}
 
@@ -167,7 +168,7 @@ namespace timeloop {
       }
 
       void doTimeStep(const Set<SUID> &selectors) override;
-      void doTimeStep(const Set<SUID> &selectors, WcTimingPool &tp) override;
+      void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &tp) override;
 
       uint_t nextId_;
       std::vector<uint_t> sweepsToDelete_;
@@ -180,7 +181,7 @@ namespace timeloop {
 } // namespace timeloop
 } // namespace walberla
 
-
+#include "SweepTimeloop.impl.h"
 
 //======================================================================================================================
 //
@@ -189,7 +190,8 @@ namespace timeloop {
 //======================================================================================================================
 
 namespace walberla {
-   using timeloop::SweepTimeloop;
+   using SweepTimeloop = typename timeloop::SweepTimeloop < >;
+   using DeviceSynchronizeSweepTimeloop = typename timeloop::SweepTimeloop < timing::DeviceSynchronizePolicy >;
 
    using timeloop::Sweep;
    using timeloop::SweepOnBlock;
diff --git a/src/timeloop/SweepTimeloop.cpp b/src/timeloop/SweepTimeloop.impl.h
similarity index 90%
rename from src/timeloop/SweepTimeloop.cpp
rename to src/timeloop/SweepTimeloop.impl.h
index 5721c51c7..481ddbaca 100644
--- a/src/timeloop/SweepTimeloop.cpp
+++ b/src/timeloop/SweepTimeloop.impl.h
@@ -32,8 +32,8 @@ namespace timeloop {
 //////////////////////////   Execution of Timeloop  ////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-void SweepTimeloop::doTimeStep(const Set<SUID> &selectors)
+template < typename TP >
+void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors)
 {
    removeForDeletionMarkedSweeps();
    //iterate over all registered sweeps
@@ -43,7 +43,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors)
 
       //select and execute before functions
       for( size_t j=0; j < s.beforeFuncs.size(); ++j )
-         executeSelectable(s.beforeFuncs[j].selectableFunc_,selectors,"Pre-Sweep Function");
+         this->executeSelectable(s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function");
 
       // Loop over all blocks
       for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi )
@@ -81,11 +81,12 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors)
 
       // select and execute after functions
       for( size_t j=0; j < s.afterFuncs.size(); ++j )
-         executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function");
+         this->executeSelectable(s.afterFuncs[j].selectableFunc_, selectors, "Post-Sweep Function");
    }
 }
 
-void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing)
+template < typename TP >
+void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing)
 {
    removeForDeletionMarkedSweeps();
    // On first run we extract all possible names of sweeps, independent of selectors
@@ -113,7 +114,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing)
 
       //select and execute before functions
       for( size_t j=0; j < s.beforeFuncs.size(); ++j )
-         executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing );
+         this->executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing );
 
       for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi )
       {
@@ -149,7 +150,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing)
 
       // select and execute after functions
       for( size_t j=0; j < s.afterFuncs.size(); ++j )
-         executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing );
+         this->executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing );
    }
 }
 
diff --git a/src/timeloop/Timeloop.h b/src/timeloop/Timeloop.h
index 48b7de6ce..faed83b06 100644
--- a/src/timeloop/Timeloop.h
+++ b/src/timeloop/Timeloop.h
@@ -48,6 +48,8 @@ using VoidFctNoArguments = std::function<void ()>;
 * \ingroup timeloop
 */
 //*******************************************************************************************************************
+
+template < typename TP = timing::WcPolicy >
 class Timeloop : public ITimeloop
 {
 private:
@@ -109,7 +111,10 @@ public:
    //**Construction & Destruction************************************************************************************
    /*! \name Construction & Destruction */
    //@{
-   Timeloop( uint_t nrOfTimeSteps );
+   Timeloop( uint_t nrOfTimeSteps )
+      : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false )
+   {
+   }
 
    ~Timeloop() override = default;
    //@}
@@ -121,17 +126,17 @@ public:
    //@{
    void run() override                  { run(true); }
    void run( const bool logTimeStep );
-   void run( WcTimingPool & timing, const bool logTimeStep = true );
+   void run( timing::TimingPool<TP> & timing, const bool logTimeStep = true );
 
    void singleStep() override { singleStep(true); }
    void singleStep( const bool logTimeStep );
-   void singleStep( WcTimingPool & timing, const bool logTimeStep = true );
+   void singleStep( timing::TimingPool<TP> & timing, const bool logTimeStep = true );
 
    void stop() override;
    void synchronizedStop( bool stop ) override;
 
     void setCurrentTimeStepToZero()     { curTimeStep_ = 0;  }
-    void setCurrentTimeStep( uint_t ts) override { curTimeStep_ = ts; }
+    void setCurrentTimeStep( uint_t ts ) override { curTimeStep_ = ts; }
 
     //@}
    //****************************************************************************************************************
@@ -183,7 +188,7 @@ public:
 protected:
 
    virtual void doTimeStep(const Set<SUID> &selectors) = 0;
-   virtual void doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) = 0;
+   virtual void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing) = 0;
 
 
    void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
@@ -192,7 +197,7 @@ protected:
    void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
                           const Set<SUID> & selector,
                           const std::string & what,
-                          WcTimingPool & tp);
+                          timing::TimingPool<TP> & tp);
 
 
    uint_t curTimeStep_;   ///< current time step
@@ -210,6 +215,8 @@ protected:
 } // namespace timeloop
 } // namespace walberla
 
+#include "Timeloop.impl.h"
+
 
 
 //======================================================================================================================
@@ -219,6 +226,7 @@ protected:
 //======================================================================================================================
 
 namespace walberla {
-   using timeloop::Timeloop;
+   using Timeloop = typename timeloop::Timeloop < >;
+   using DeviceSynchronizeTimeloop = typename timeloop::Timeloop < timing::DeviceSynchronizePolicy >;
 }
 
diff --git a/src/timeloop/Timeloop.cpp b/src/timeloop/Timeloop.impl.h
similarity index 83%
rename from src/timeloop/Timeloop.cpp
rename to src/timeloop/Timeloop.impl.h
index fd46e16c1..832f1c7ad 100644
--- a/src/timeloop/Timeloop.cpp
+++ b/src/timeloop/Timeloop.impl.h
@@ -29,16 +29,8 @@
 namespace walberla {
 namespace timeloop {
 
-
-Timeloop::Timeloop( uint_t nrOfTimeSteps)
-   : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false )
-{
-}
-
-
-
-
-void Timeloop::run( const bool logTimeStep )
+template < typename TP >
+void Timeloop<TP>::run( const bool logTimeStep )
 {
    WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" )
    while(curTimeStep_ < nrOfTimeSteps_) {
@@ -51,7 +43,8 @@ void Timeloop::run( const bool logTimeStep )
    WALBERLA_LOG_PROGRESS( "Timeloop finished" )
 }
 
-void Timeloop::run( WcTimingPool & tp, const bool logTimeStep )
+template < typename TP >
+void Timeloop<TP>::run(timing::TimingPool<TP> & tp, const bool logTimeStep )
 {
    WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" )
 
@@ -74,7 +67,9 @@ void Timeloop::run( WcTimingPool & tp, const bool logTimeStep )
 *  before reaching nrOfTimeSteps
 */
 //*******************************************************************************************************************
-void Timeloop::stop()
+
+template < typename TP >
+void Timeloop<TP>::stop()
 {
    stop_ = true;
 }
@@ -89,13 +84,16 @@ void Timeloop::stop()
 *     -> If at least on process calls synchronizedStop(true) the timeloop is stopped
 */
 //*******************************************************************************************************************
-void Timeloop::synchronizedStop( bool stopVal )
+
+template < typename TP >
+void Timeloop<TP>::synchronizedStop( bool stopVal )
 {
    stop_ = stopVal;
    mpi::allReduceInplace( stop_, mpi::LOGICAL_OR );
 }
 
-void Timeloop::singleStep( const bool logTimeStep )
+template < typename TP >
+void Timeloop<TP>::singleStep( const bool logTimeStep )
 {
    LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep );
 
@@ -112,7 +110,8 @@ void Timeloop::singleStep( const bool logTimeStep )
    ++curTimeStep_;
 }
 
-void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep )
+template < typename TP >
+void Timeloop<TP>::singleStep( timing::TimingPool<TP> & tp, const bool logTimeStep )
 {
    LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep );
 
@@ -133,17 +132,17 @@ void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep )
 //////////////////////////////////////////   Registering Functions   ///////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-Timeloop::FctHandle
-Timeloop::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id,
+template < typename TP >
+typename Timeloop<TP>::FctHandle
+Timeloop<TP>::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id,
                                 const Set<SUID> & r, const Set<SUID> & e )
 {
     beforeFunctions_.emplace_back(f,r,e,id );
     return beforeFunctions_.size() - 1;
 }
 
-
-void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h,
+template < typename TP >
+void Timeloop<TP>::addFuncBeforeTimeStep(const Timeloop::FctHandle & h,
                                      const VoidFctNoArguments& f, const std::string & id,
                                      const Set<SUID>&r, const Set<SUID> & e )
 {
@@ -152,17 +151,17 @@ void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h,
 }
 
 
-
-Timeloop::FctHandle
-Timeloop::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id,
+template < typename TP >
+typename Timeloop<TP>::FctHandle
+Timeloop<TP>::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id,
                                       const Set<SUID> & r, const Set<SUID> & e )
 {
     afterFunctions_.emplace_back(f,r,e,id );
     return afterFunctions_.size() - 1;
 }
 
-
-void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h,
+template < typename TP >
+void Timeloop<TP>::addFuncAfterTimeStep(const Timeloop::FctHandle & h,
                                            const VoidFctNoArguments& f, const std::string & id,
                                            const Set<SUID>&r, const Set<SUID> & e )
 {
@@ -172,8 +171,8 @@ void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h,
 
 
 
-
-void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
+template < typename TP >
+void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
                                   const Set<SUID> & selector,
                                   const std::string & what )
 {
@@ -192,10 +191,11 @@ void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN
    LIKWID_MARKER_STOP( objectName.c_str() );
 }
 
-void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
+template < typename TP >
+void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable,
                                   const Set<SUID> & selector,
                                   const std::string & what,
-                                  WcTimingPool & timing )
+                                  timing::TimingPool<TP> & timing )
 {
    std::string objectName;
    const VoidFctNoArguments * exe = selectable.getUnique( selector, objectName );
diff --git a/tests/pe/Refinement.cpp b/tests/pe/Refinement.cpp
index 943526d15..96900a625 100644
--- a/tests/pe/Refinement.cpp
+++ b/tests/pe/Refinement.cpp
@@ -167,7 +167,7 @@ int main( int argc, char ** argv )
    createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(5,5,5), 1);
    createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(15,6,6), 1);
 
-   timeloop::SweepTimeloop timeloop( forest->getBlockStorage(), 1 );
+   SweepTimeloop timeloop( forest->getBlockStorage(), 1 );
    timeloop.addFuncBeforeTimeStep( simpleLB, "refreshFunctorName" );
 
    for (int i = 0; i < 1; ++i)
-- 
GitLab