diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index e6799acb7b4247371d9979f175b3064b873e1f43..c5b09184bacb697491ffea3b1782ad84cbe3efd9 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -29,9 +29,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
 endif()
 
 if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
-   add_subdirectory(CommunicationGPU)
-
    if ( WALBERLA_BUILD_WITH_CODEGEN )
+      add_subdirectory( CommunicationGPU )
       add_subdirectory( UniformGridGPU )
      add_subdirectory( NonUniformGridGPU )
    endif()
diff --git a/apps/benchmarks/CommunicationGPU/CMakeLists.txt b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
index 229e0a4a4164f5c5480c161dfb10fa134493c00a..d45a2b7967cd83d740eadea77faa18fdd83fbd3e 100644
--- a/apps/benchmarks/CommunicationGPU/CMakeLists.txt
+++ b/apps/benchmarks/CommunicationGPU/CMakeLists.txt
@@ -3,5 +3,18 @@
 waLBerla_link_files_to_builddir( "*.dat" )
 waLBerla_link_files_to_builddir( "*.py" )
 
-waLBerla_add_executable ( NAME CommunicationGPU
-                          DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling )
+waLBerla_generate_target_from_python(NAME CommunicationGPUGenerated
+                                     FILE CommunicationGPUCodeGen.py
+                                     OUT_FILES StorageSpec.h StorageSpec.${CODEGEN_FILE_SUFFIX}
+                                               CommunicationGPUInfoHeader.h
+                                     )
+
+waLBerla_add_executable (
+      NAME CommunicationGPU
+      FILES CommunicationGPU.cpp
+      DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
+
+waLBerla_add_executable (
+      NAME GPUPackPerformance
+      FILES GPUPackPerformance.cpp
+      DEPENDS blockforest core domain_decomposition field gpu postprocessing sqlite python_coupling CommunicationGPUGenerated )
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
index 32a504072d5e776ac9e0a89bd1669a636cb6416d..1cee36f1e76fa563b333b332f5b93497ede14b86 100644
--- a/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.cpp
@@ -28,33 +28,34 @@
 #include "core/mpi/all.h"
 #include "core/waLBerlaBuildInfo.h"
 
-#include "field/AddToStorage.h"
-
-#include "gpu/AddGPUFieldToStorage.h"
 #include "gpu/DeviceSelectMPI.h"
 #include "gpu/FieldCopy.h"
-#include "gpu/GPUField.h"
 #include "gpu/GPUWrapper.h"
 #include "gpu/HostFieldAllocator.h"
 #include "gpu/ParallelStreams.h"
 #include "gpu/communication/UniformGPUScheme.h"
 #include "gpu/communication/MemcpyPackInfo.h"
 
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 
-#include "stencil/D3Q27.h"
-
 #include "sqlite/SQLite.h"
 
-#include <cmath>
+#include "CommunicationGPUInfoHeader.h"
 
 using namespace walberla;
 using gpu::communication::UniformGPUScheme;
 
-using Field_T = field::GhostLayerField<real_t, 1>;
-using GPUField_T = gpu::GPUField<real_t>;
+using StorageSpecification_T = lbm::StorageSpec;
+using Stencil_T = StorageSpecification_T::Stencil;
 
-using Stencil_T = stencil::D3Q27;
+using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
 
 std::string fromEnv(const char *envVar) {
    auto env = std::getenv(envVar);
@@ -95,6 +96,7 @@ int main(int argc, char **argv) {
      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+     const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");
 
      field::Layout layout;
      if (layoutStr == "fzyx")
@@ -127,12 +129,10 @@ int main(int argc, char **argv) {
                                                        false // keepGlobalBlockInformation
      );
 
-     auto rank = mpiManager->rank();
-
+     const StorageSpecification_T StorageSpec = StorageSpecification_T();
      auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
-     const BlockDataID fieldCPU = field::addToStorage<Field_T>(blocks, "field", real_c(rank), layout, uint_c(1),
-                                                               allocator);
-     const BlockDataID fieldGPU = gpu::addGPUFieldToStorage<Field_T>(blocks, fieldCPU, "field GPU", true);
+     const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
+     const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);
 
      gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
 
@@ -141,14 +141,40 @@ int main(int argc, char **argv) {
      /// COMMUNICATION SCHEME                                                                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+     std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
+     if(packInfoName == "MemcpyPackInfo")
+     {
+        packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+     }
+     else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
+     {
+        packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
+     }
+     else
+     {
+        WALBERLA_ABORT("Unknown packInfoName: " << packInfoName)
+     }
+
      UniformGPUScheme<Stencil_T> communication(blocks, gpuDirectComm);
-     auto packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+
      communication.addPackInfo(packInfo);
 
-     auto communicate = communication.getCommunicateFunctor();
      auto commStart = communication.getStartCommunicateFunctor();
      auto commWait = communication.getWaitFunctor();
 
+     WALBERLA_ROOT_SECTION()
+     {
+        for (auto &block : *blocks)
+        {
+           for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir )
+           {
+              WALBERLA_LOG_INFO("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block) << " bytes will be communicated")
+           }
+        }
+     }
+
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      /// TIME STEP DEFINITIONS                                                                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -178,30 +204,30 @@ int main(int argc, char **argv) {
 
      mpi::broadcastObject(iterations);
 
-     WcTimingPool timingPool;
+     auto timingPool = std::make_shared<WcTimingPool>();
+     communication.enableTiming(timingPool);
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-     WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations);
+     WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)
+
      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
-        timingPool["totalTime"].start();
+        timingPool->operator[]("totalTime").start();
         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
-           timingPool["commStart"].start();
+           timingPool->operator[]("commStart").start();
            commStart();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-           timingPool["commStart"].end();
+           timingPool->operator[]("commStart").end();
 
-           timingPool["commWait"].start();
+           timingPool->operator[]("commWait").start();
            commWait();
            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
-           timingPool["commWait"].end();
+           timingPool->operator[]("commWait").end();
         }
-        timingPool["totalTime"].end();
+        timingPool->operator[]("totalTime").end();
      }
 
      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
-     auto reducedTimingPool = timingPool.getReduced(timing::REDUCE_TOTAL, 0);
+     auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
      WALBERLA_ROOT_SECTION() {
-        WALBERLA_LOG_RESULT(*reducedTimingPool);
+        WALBERLA_LOG_RESULT(*reducedTimingPool)
 
         std::map<std::string, walberla::int64_t> integerProperties;
         std::map<std::string, double> realProperties;
@@ -213,9 +239,8 @@ int main(int argc, char **argv) {
              stringProperties[it->first] = it->second;
        }
 
-        realProperties["total_min"] = real_c(timingPool["totalTime"].min()) / real_c(iterations);
-        realProperties["total_avg"] = real_c(timingPool["totalTime"].average() / real_c(iterations));
-        realProperties["total_max"] = real_c(timingPool["totalTime"].max() / real_c(iterations));
+        realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
+        realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max()) / real_c(iterations);
 
        integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
        integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
@@ -237,6 +262,7 @@ int main(int argc, char **argv) {
        integerProperties["outerIterations"] = int64_c(outerIterations);
 
        stringProperties["layout"] = layoutStr;
+       stringProperties["packInfoName"] = packInfoName;
 
        stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
        stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
@@ -258,7 +284,7 @@ int main(int argc, char **argv) {
        stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
 
        auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
-       sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, timingPool, "TimingRoot");
+       sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
        sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
     }
   }
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPU.py b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
index 4de22a6ecf86be8a4a2ca841b4064746943f87b3..271e943e6e17280b136536375c76934cc09904e2 100644
--- a/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPU.py
@@ -2,12 +2,13 @@ import os
 import waLBerla as wlb
 
 DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+LOGLEVEL = 'info'
 
 
 class Scenario:
     def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
                  warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
-                 time_for_benchmark=1, outer_iterations=1):
+                 time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
         self.cells_per_block = cells_per_block
         self.blocks_per_process = 1
 
@@ -22,6 +23,8 @@ class Scenario:
         self.time_for_benchmark = time_for_benchmark
         self.outer_iterations = outer_iterations
 
+        self.packinfo = packinfo
+
     @wlb.member_callback
     def config(self, print_dict=True):
         from pprint import pformat
@@ -42,6 +45,10 @@ class Scenario:
                 'maxIterations': self.max_iterations,
                 'timeForBenchmark': self.time_for_benchmark,
                 'outerIterations': self.outer_iterations,
+                'packinfo': self.packinfo
+            },
+            'Logging': {
+                'logLevel': LOGLEVEL
             }
         }
 
@@ -66,15 +73,18 @@ def single_run():
     time_for_benchmark = 1
     outer_iterations = 1
 
-    scenarios.add(Scenario(cells_per_block=cells_per_block,
-                           gpu_direct_comm=gpu_direct_comm,
-                           layout=layout,
-                           warmup_iterations=warmup_iterations,
-                           iterations=iterations,
-                           min_iterations=min_iterations,
-                           max_iterations=max_iterations,
-                           time_for_benchmark=time_for_benchmark,
-                           outer_iterations=outer_iterations))
+    for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
+
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               gpu_direct_comm=gpu_direct_comm,
+                               layout=layout,
+                               warmup_iterations=warmup_iterations,
+                               iterations=iterations,
+                               min_iterations=min_iterations,
+                               max_iterations=max_iterations,
+                               time_for_benchmark=time_for_benchmark,
+                               outer_iterations=outer_iterations,
+                               packinfo=pack_info_name))
 
 
 wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
diff --git a/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py b/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ec10fde6415c27428a44237d5240d93aa2c4ab
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/CommunicationGPUCodeGen.py
@@ -0,0 +1,23 @@
+from pystencils import Target
+from lbmpy import Stencil, LBStencil, LBMConfig, create_lb_method
+
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_storage_specification
+
+
+with CodeGeneration() as ctx:
+
+    stencil = LBStencil(Stencil.D3Q27)
+    streaming_pattern = 'pull'
+    nonuniform = False
+    target = Target.GPU
+    data_type = "float64"
+    cpu_openmp = False
+
+    lbm_config = LBMConfig(stencil=stencil)
+    method = create_lb_method(lbm_config=lbm_config)
+
+    generate_lbm_storage_specification(ctx, "StorageSpec", method, lbm_config,
+                                       nonuniform=nonuniform, target=target, data_type=data_type)
+
+    generate_info_header(ctx, 'CommunicationGPUInfoHeader')
diff --git a/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f7d1ab2ed8e67ab3a49aec0f56451c08592c248
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.cpp
@@ -0,0 +1,315 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GPUPackPerformance.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+#include "core/math/all.h"
+#include "core/mpi/all.h"
+
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/GPUWrapper.h"
+#include "gpu/HostFieldAllocator.h"
+#include "gpu/ParallelStreams.h"
+#include "gpu/communication/UniformGPUScheme.h"
+#include "gpu/communication/MemcpyPackInfo.h"
+
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "sqlite/SQLite.h"
+
+#include "CommunicationGPUInfoHeader.h"
+
+#include <cmath>
+
+using namespace walberla;
+using gpu::communication::UniformGPUScheme;
+using StorageSpecification_T = lbm::StorageSpec;
+using Stencil_T = StorageSpecification_T::Stencil;
+
+using Field_T = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+
+
+int main(int argc, char **argv) {
+   mpi::Environment const env(argc, argv);
+   gpu::selectDeviceBasedOnMpiRank();
+
+   auto mpiManager = mpi::MPIManager::instance();
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) {
+      if (mpiManager->isMPIInitialized())
+         mpiManager->resetMPI();
+
+      WALBERLA_MPI_WORLD_BARRIER()
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// SETUP AND CONFIGURATION                                                                                      ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // Reading parameters
+      auto parameters = config->getOneBlock("Parameters");
+      const std::string databaseFile = parameters.getParameter<std::string>("databaseFile",
+                                                                            "CommunicationGPU.sqlite");
+      auto gpuDirectComm = parameters.getParameter<bool>("gpuDirectComm", false);
+      const std::string layoutStr = parameters.getParameter<std::string>("layout", "fzyx");
+
+      auto runCfg = config->getOneBlock("Run");
+      const std::string packInfoName = runCfg.getParameter<std::string>("packinfo", "UniformGeneratedGPUPdfPackInfo");
+//      const uint_t warmupIterations = runCfg.getParameter<uint_t>("warmupIterations", 2);
+//      uint_t iterations = runCfg.getParameter<uint_t>("iterations", 10);
+//      const uint_t minIterations = runCfg.getParameter<uint_t>("minIterations", 2);
+//      const uint_t maxIterations = runCfg.getParameter<uint_t>("maxIterations", 100);
+//
+//      const real_t timeForBenchmark = runCfg.getParameter<real_t>("timeForBenchmark", real_t(-1.0));
+//      const uint_t outerIterations = runCfg.getParameter<uint_t>("outerIterations", 2);
+
+      field::Layout layout;
+      if (layoutStr == "fzyx")
+         layout = field::fzyx;
+      else if (layoutStr == "zyxf")
+         layout = field::zyxf;
+      else {
+         WALBERLA_ABORT_NO_DEBUG_INFO("Unknown layout string " << layoutStr << ". Valid values are fzyx and zyxf.")
+      }
+
+
+      auto domainCfg = config->getOneBlock("Domain");
+      const Vector3<uint_t> cellsPerBlock = domainCfg.getParameter<Vector3<uint_t> >("cellsPerBlock");
+      uint_t const blocksPerProcess = domainCfg.getParameter<uint_t>("blocksPerProcess", 1);
+
+      auto numProcesses = mpiManager->numProcesses();
+      auto processes = math::getFactors3D(uint_c(numProcesses));
+      auto blockDecomposition = math::getFactors3D(uint_c(numProcesses) * blocksPerProcess);
+      auto aabb = AABB(real_t(0), real_t(0), real_t(0),
+                       real_c(cellsPerBlock[0] * processes[0] * blocksPerProcess),
+                       real_c(cellsPerBlock[1] * processes[1] * blocksPerProcess),
+                       real_c(cellsPerBlock[2] * processes[2] * blocksPerProcess));
+
+      auto blocks = blockforest::createUniformBlockGrid(aabb,
+                                                        blockDecomposition[0], blockDecomposition[1],
+                                                        blockDecomposition[2],
+                                                        cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
+                                                        processes[0], processes[1], processes[2],
+                                                        true, true, true, //periodicity
+                                                        false // keepGlobalBlockInformation
+      );
+
+      const StorageSpecification_T StorageSpec = StorageSpecification_T();
+      auto allocator = make_shared<gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
+      const BlockDataID fieldCPU = lbm_generated::addPdfFieldToStorage(blocks, "field", StorageSpec, uint_c(1), layout, allocator);
+      const BlockDataID fieldGPU = lbm_generated::addGPUPdfFieldToStorage< Field_T >(blocks, fieldCPU, StorageSpec, "field GPU", true);
+
+      gpu::fieldCpy<GPUField_T, Field_T>(blocks, fieldGPU, fieldCPU);
+
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// COMMUNICATION SCHEME                                                                                         ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      std::shared_ptr<gpu::GeneratedGPUPackInfo> packInfo;
+      if(packInfoName == "MemcpyPackInfo")
+      {
+         packInfo = std::make_shared<gpu::communication::MemcpyPackInfo<GPUField_T >>(fieldGPU);
+      }
+      else if (packInfoName == "UniformGeneratedGPUPdfPackInfo")
+      {
+         packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUField_T >>(fieldGPU);
+      }
+      else
+      {
+         WALBERLA_ABORT("Unknown packInfoName: " << packInfoName)
+      }
+      // WcTimer timer;
+
+      uint_t packSize = 0;
+
+      for (auto &block : *blocks) {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            packSize += packInfo->size(*dir, &block);
+            WALBERLA_LOG_INFO_ON_ROOT("For direction: " << stencil::dirToString[*dir] << " " << packInfo->size(*dir, &block)
+                                      << " bytes will be communicated")
+         }
+      }
+
+      unsigned char *buffer;
+      WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
+
+      unsigned char *beginPtr = buffer;
+      unsigned char *currentPtr = buffer;
+
+
+      for (auto &iBlock : *blocks)
+      {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            auto senderBlock = dynamic_cast< Block * >( &iBlock );
+            packInfo->pack(*dir, currentPtr, senderBlock, nullptr);
+            currentPtr += packInfo->size(*dir, senderBlock);
+         }
+      }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
+      WALBERLA_CHECK_NOT_NULLPTR(currentPtr)
+      WALBERLA_CHECK_NOT_NULLPTR(beginPtr)
+
+      currentPtr = beginPtr;
+
+      for (auto &iBlock : *blocks)
+      {
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            auto senderBlock = dynamic_cast< Block * >( &iBlock );
+            packInfo->unpack(*dir, currentPtr, senderBlock, nullptr);
+            currentPtr += packInfo->size(*dir, senderBlock);
+         }
+      }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuFree(buffer)) // release the pack/unpack buffer before the next scenario is run
+
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// TIME STEP DEFINITIONS                                                                                        ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//      commStart();
+//      commWait();
+//
+//      WcTimer warmupTimer;
+//      warmupTimer.start();
+//      for (uint_t warmupCounter = 0; warmupCounter < warmupIterations; ++warmupCounter) {
+//         commStart();
+//         commWait();
+//         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//      }
+//      warmupTimer.end();
+//
+//
+//      auto estimatedTimePerIteration = warmupTimer.last() / real_c(warmupIterations);
+//      if (timeForBenchmark > 0) {
+//         iterations = uint_c(timeForBenchmark / estimatedTimePerIteration);
+//         if (iterations < minIterations)
+//            iterations = minIterations;
+//         if (iterations > maxIterations)
+//            iterations = maxIterations;
+//      }
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//
+//      mpi::broadcastObject(iterations);
+//
+//      auto timingPool = std::make_shared<WcTimingPool>();
+//      communication.enableTiming(timingPool);
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//      WALBERLA_LOG_INFO_ON_ROOT("Running " << outerIterations << " outer iterations of size " << iterations)
+//
+//      for (uint_t outerCtr = 0; outerCtr < outerIterations; ++outerCtr) {
+//         for (uint_t ctr = 0; ctr < iterations; ++ctr) {
+//            timingPool->operator[]("commStart").start();
+//            commStart();
+//            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//            timingPool->operator[]("commStart").end();
+//
+//            timingPool->operator[]("commWait").start();
+//            commWait();
+//            WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//            timingPool->operator[]("commWait").end();
+//         }
+//      }
+//
+//      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+//
+//      auto reducedTimingPool = timingPool->getReduced(timing::REDUCE_TOTAL, 0);
+//      WALBERLA_ROOT_SECTION() {
+//         WALBERLA_LOG_RESULT(*reducedTimingPool)
+//
+//         std::map<std::string, walberla::int64_t> integerProperties;
+//         std::map<std::string, double> realProperties;
+//         std::map<std::string, std::string> stringProperties;
+//
+//         auto databaseBlock = config->getBlock("Database");
+//         if (databaseBlock) {
+//            for (auto it = databaseBlock.begin(); it != databaseBlock.end(); ++it)
+//               stringProperties[it->first] = it->second;
+//         }
+//
+//         realProperties["total_min"] = real_c(timingPool->operator[]("totalTime").min()) / real_c(iterations);
+//         realProperties["total_max"] = real_c(timingPool->operator[]("totalTime").max() / real_c(iterations));
+//
+//         integerProperties["cellsPerBlock0"] = int64_c(cellsPerBlock[0]);
+//         integerProperties["cellsPerBlock1"] = int64_c(cellsPerBlock[1]);
+//         integerProperties["cellsPerBlock2"] = int64_c(cellsPerBlock[2]);
+//
+//         integerProperties["processes0"] = int64_c(processes[0]);
+//         integerProperties["processes1"] = int64_c(processes[1]);
+//         integerProperties["processes2"] = int64_c(processes[2]);
+//
+//         integerProperties["blocks0"] = int64_c(blockDecomposition[0]);
+//         integerProperties["blocks1"] = int64_c(blockDecomposition[1]);
+//         integerProperties["blocks2"] = int64_c(blockDecomposition[2]);
+//
+//         integerProperties["blocksPerProcess"] = int64_c(blocksPerProcess);
+//         integerProperties["cartesianCommunicator"] = mpiManager->hasCartesianSetup();
+//
+//         integerProperties["warmupIterations"] = int64_c(warmupIterations);
+//         integerProperties["iterations"] = int64_c(iterations);
+//         integerProperties["outerIterations"] = int64_c(outerIterations);
+//
+//         stringProperties["layout"] = layoutStr;
+//
+//         stringProperties["SLURM_CLUSTER_NAME"] = fromEnv("SLURM_CLUSTER_NAME");
+//         stringProperties["SLURM_CPUS_ON_NODE"] = fromEnv("SLURM_CPUS_ON_NODE");
+//         stringProperties["SLURM_CPUS_PER_TASK"] = fromEnv("SLURM_CPUS_PER_TASK");
+//         stringProperties["SLURM_JOB_ACCOUNT"] = fromEnv("SLURM_JOB_ACCOUNT");
+//         stringProperties["SLURM_JOB_ID"] = fromEnv("SLURM_JOB_ID");
+//         stringProperties["SLURM_JOB_CPUS_PER_NODE"] = fromEnv("SLURM_JOB_CPUS_PER_NODE");
+//         stringProperties["SLURM_JOB_NAME"] = fromEnv("SLURM_JOB_NAME");
+//         stringProperties["SLURM_JOB_NUM_NODES"] = fromEnv("SLURM_JOB_NUM_NODES");
+//         stringProperties["SLURM_NTASKS"] = fromEnv("SLURM_NTASKS");
+//         stringProperties["SLURM_NTASKS_PER_CORE"] = fromEnv("SLURM_NTASKS_PER_CORE");
+//         stringProperties["SLURM_NTASKS_PER_NODE"] = fromEnv("SLURM_NTASKS_PER_NODE");
+//         stringProperties["SLURM_NTASKS_PER_SOCKET"] = fromEnv("SLURM_NTASKS_PER_SOCKET");
+//         stringProperties["SLURM_TASKS_PER_NODE"] = fromEnv("SLURM_TASKS_PER_NODE");
+//
+//         stringProperties["buildMachine"] = std::string(WALBERLA_BUILD_MACHINE);
+//         stringProperties["gitVersion"] = std::string(WALBERLA_GIT_SHA1);
+//         stringProperties["buildType"] = std::string(WALBERLA_BUILD_TYPE);
+//         stringProperties["compilerFlags"] = std::string(WALBERLA_COMPILER_FLAGS);
+//
+//         auto runId = sqlite::storeRunInSqliteDB(databaseFile, integerProperties, stringProperties, realProperties);
+//         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *timingPool, "TimingRoot");
+//         sqlite::storeTimingPoolInSqliteDB(databaseFile, runId, *reducedTimingPool, "TimingReduced");
+//      }
+   }
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0cd8969395eacaee42156b326682eb8b191b665
--- /dev/null
+++ b/apps/benchmarks/CommunicationGPU/GPUPackPerformance.py
@@ -0,0 +1,90 @@
+import os
+import waLBerla as wlb
+
+DB_FILE = os.environ.get('DB_FILE', "CommunicationGPU.sqlite3")
+LOGLEVEL = 'info'
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(128, 128, 128), gpu_direct_comm=False, layout="fzyx",
+                 warmup_iterations=10, iterations=100, min_iterations=10, max_iterations=100,
+                 time_for_benchmark=1, outer_iterations=1, packinfo="MemcpyPackInfo"):
+        self.cells_per_block = cells_per_block
+        self.blocks_per_process = 1
+
+        self.database_file = DB_FILE
+        self.gpu_direct_comm = gpu_direct_comm
+        self.layout = layout
+
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+        self.min_iterations = min_iterations
+        self.max_iterations = max_iterations
+        self.time_for_benchmark = time_for_benchmark
+        self.outer_iterations = outer_iterations
+
+        self.packinfo = packinfo
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'Domain': {
+                'cellsPerBlock': self.cells_per_block,
+                'blocksPerProcess': self.blocks_per_process,
+            },
+            'Parameters': {
+                'databaseFile': self.database_file,
+                'gpuDirectComm': self.gpu_direct_comm,
+                'layout': self.layout,
+            },
+            'Run': {
+                'warmupIterations': self.warmup_iterations,
+                'iterations': self.iterations,
+                'minIterations': self.min_iterations,
+                'maxIterations': self.max_iterations,
+                'timeForBenchmark': self.time_for_benchmark,
+                'outerIterations': self.outer_iterations,
+                'packinfo': self.packinfo
+            },
+            'Logging': {
+                'logLevel': LOGLEVEL
+            }
+        }
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+        return config_dict
+
+
+# -------------------------------------- Profiling -----------------------------------
+def single_run():
+    """Benchmarks the GPU pack/unpack performance of the available pack infos"""
+
+    scenarios = wlb.ScenarioManager()
+
+    cells_per_block = (128, 128, 128)
+    gpu_direct_comm = False
+    layout = "fzyx"
+    warmup_iterations = 10
+    iterations = 100
+    min_iterations = 10
+    max_iterations = 100
+    time_for_benchmark = 1
+    outer_iterations = 1
+
+    for pack_info_name in ["MemcpyPackInfo", "UniformGeneratedGPUPdfPackInfo"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               gpu_direct_comm=gpu_direct_comm,
+                               layout=layout,
+                               warmup_iterations=warmup_iterations,
+                               iterations=iterations,
+                               min_iterations=min_iterations,
+                               max_iterations=max_iterations,
+                               time_for_benchmark=time_for_benchmark,
+                               outer_iterations=outer_iterations,
+                               packinfo=pack_info_name))
+
+
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+single_run()
diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py
index c113604381be85e3895e3744285a528d4786f84e..01287a764809c96c43de08eb357c9336f5a606d8 100644
--- a/python/lbmpy_walberla/storage_specification.py
+++ b/python/lbmpy_walberla/storage_specification.py
@@ -39,9 +39,6 @@ def generate_lbm_storage_specification(generation_context, class_name: str,
     if nonuniform:
         kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels)
 
-    values_per_cell = len(stencil)
-    dimension = len(stencil[0])
-
     # Pure storage specification
     if not stencil_name:
         raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
diff --git a/src/core/mpi/BufferSystemHelper.impl.h b/src/core/mpi/BufferSystemHelper.impl.h
index e6236f100ca3d8f58817824c16ac5844e41f4af6..52236a7ec4cc03ef2a47966ed699be67a625016d 100644
--- a/src/core/mpi/BufferSystemHelper.impl.h
+++ b/src/core/mpi/BufferSystemHelper.impl.h
@@ -147,7 +147,7 @@ MPIRank KnownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank, Re
 
    recvRequests_[ uint_c( requestIndex ) ] = MPI_REQUEST_NULL;
 
-   MPIRank senderRank = status.MPI_SOURCE;
+   MPIRank const senderRank = status.MPI_SOURCE;
 
    WALBERLA_ASSERT_GREATER_EQUAL( senderRank, 0 );
diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h
index 6c15988f4f2687275fea7f0f8be36b2e7d99fcf6..7a36f0e576af29dc3b93e96b769e17d3377779bc 100644
--- a/src/gpu/communication/MemcpyPackInfo.h
+++ b/src/gpu/communication/MemcpyPackInfo.h
@@ -17,7 +17,7 @@ template<typename GPUFieldType>
 class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo
 {
 public:
-   MemcpyPackInfo( BlockDataID pdfsID_ ) : pdfsID(pdfsID_) {};
+   MemcpyPackInfo( BlockDataID fieldID_ ) : fieldID(fieldID_) {};
    ~MemcpyPackInfo() override = default;
 
    void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override;
@@ -26,7 +26,7 @@ public:
    uint_t size(stencil::Direction dir, IBlock * block) override;
 
 private:
-   BlockDataID pdfsID;
+   BlockDataID fieldID;
    uint_t numberOfGhostLayers_{0};
    bool communicateAllGhostLayers_{true};
diff --git a/src/gpu/communication/MemcpyPackInfo.impl.h b/src/gpu/communication/MemcpyPackInfo.impl.h
index 2110933cda5322828f40cc14b471be5c6a309bfe..843166a55a3ab24db491c9668e85720e51c0aea3 100644
--- a/src/gpu/communication/MemcpyPackInfo.impl.h
+++ b/src/gpu/communication/MemcpyPackInfo.impl.h
@@ -19,7 +19,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
                                           IBlock * block, gpuStream_t stream)
 {
     // Extract field data pointer from the block
-    const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+    const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr )
 
     // cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -68,12 +68,9 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char
 template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::communicateLocal( stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream )
 {
-    // WALBERLA_ABORT("The MemcpyPackInfo does not provide a thread safe local communication. Thus is can not be used in local mode. To use it set local useLocalCommunication to false in the communication scheme")
-
-    // Extract field data pointer from the block
-    const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( pdfsID );
-    const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( pdfsID );
+    const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( fieldID );
+    const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR( senderFieldPtr )
     WALBERLA_ASSERT_NOT_NULLPTR( receiverFieldPtr )
 
@@ -128,7 +125,7 @@ template<typename GPUFieldType>
 void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer,
                                             IBlock * block, gpuStream_t stream)
 {
-    GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+    GPUFieldType * fieldPtr = block->getData< GPUFieldType >( fieldID );
     WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr)
 
     cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
@@ -173,100 +170,13 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha
 template<typename GPUFieldType>
 uint_t MemcpyPackInfo< GPUFieldType >::size(stencil::Direction dir, IBlock * block)
 {
-    auto pdfs = block->getData< GPUFieldType >(pdfsID);
+    auto field = block->getData< GPUFieldType >(fieldID);
     CellInterval ci;
-    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( pdfs ) );
-    pdfs->getGhostRegion(dir, ci, nrOfGhostLayers, false);
-
-    /*
-    uint_t elementsPerCell = 0;
-
-    switch( dir )
-    {
-        case stencil::SW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::S:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::W:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::B:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::T:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::BN:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::N:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::TE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::E:
-            elementsPerCell = 5;
-            break;
-
-        case stencil::BE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::SE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::C:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TN:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TS:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::NE:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::BW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::NW:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::BS:
-            elementsPerCell = 1;
-            break;
-
-        case stencil::TW:
-            elementsPerCell = 1;
-            break;
-
-        default:
-            elementsPerCell = 0;
-    }
-
-    return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type);
-    */
-    uint_t totalCells = ci.numCells() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type);
+    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( field ) );
+    field->getGhostRegion(dir, ci, nrOfGhostLayers, false);
+
+    uint_t totalCells = ci.numCells() * field->fSize() * sizeof(typename GPUFieldType::value_type);
 
     return totalCells;
 }
diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h
index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..9057bf7b196d9b64fe911d1bcde0651fd361c5da 100644
--- a/src/gpu/communication/UniformGPUScheme.h
+++ b/src/gpu/communication/UniformGPUScheme.h
@@ -18,7 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
 #pragma once
 
 #include "blockforest/StructuredBlockForest.h"
@@ -30,48 +29,52 @@
 
 #include "stencil/Directions.h"
 
-#include <thread>
-
-#include "gpu/GPURAII.h"
+#include "core/timing/TimingPool.h"
+
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedGPUPackInfo.h"
 
+#include <thread>
+
 namespace walberla {
 namespace gpu
 {
 namespace communication {
 
-
-
 template<typename Stencil>
 class UniformGPUScheme
 {
 public:
-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                              const int tag = 5432 );
 
-   explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+   explicit UniformGPUScheme( const weak_ptr<StructuredBlockForest>& bf,
                               const Set<SUID> & requiredBlockSelectors,
                               const Set<SUID> & incompatibleBlockSelectors,
                               bool sendDirectlyFromGPU = false,
                               bool useLocalCommunication = true,
                              const int tag = 5432 );
 
+   ~UniformGPUScheme();
+
   void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-   void startCommunication( gpuStream_t stream = nullptr);
-   void wait( gpuStream_t stream = nullptr);
+   void startCommunication();
+   void wait();
 
-   void operator()( gpuStream_t stream = nullptr )         { communicate( stream ); }
-   inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }
+   void operator()()         { communicate(); }
+   inline void communicate() { startCommunication(); wait(); }
 
-   std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr );
-   std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr );
+   std::function<void()> getCommunicateFunctor();
+   std::function<void()> getStartCommunicateFunctor();
+   std::function<void()> getWaitFunctor();
+
+   void enableTiming( const shared_ptr<WcTimingPool> & timingPool )
+   {
+      timing_ = true;
+      timingPool_ = timingPool;
+   }
 
 private:
   void setupCommunication();
@@ -92,8 +95,6 @@
 
   std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
 
-  ParallelStreams parallelSectionManager_;
-
   struct Header
   {
      BlockID blockId;
@@ -103,6 +104,11 @@
 
   Set<SUID> requiredBlockSelectors_;
   Set<SUID> incompatibleBlockSelectors_;
+
+  bool timing_{false};
+  shared_ptr<WcTimingPool> timingPool_;
+
+  gpuStream_t streams_[Stencil::Q];
 };
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 28033d1464f76709178042a5c87a27485ceacb44..3c885842f3a5c9f9a9eb50e6b10fbf3f1c85e466 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -18,9 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
-#include "gpu/ParallelStreams.h"
-
 namespace walberla {
 namespace gpu
 {
@@ -28,7 +25,7 @@ namespace communication {
 
 
   template<typename Stencil>
-  UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+  UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                bool sendDirectlyFromGPU,
                                                bool useLocalCommunication,
                                                const int tag )
@@ -39,7 +36,6 @@ namespace communication {
       useLocalCommunication_(useLocalCommunication),
       bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
       bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-      parallelSectionManager_( -1 ),
       requiredBlockSelectors_( Set<SUID>::emptySet() ),
       incompatibleBlockSelectors_( Set<SUID>::emptySet() )
   {
@@ -49,10 +45,15 @@ namespace communication {
         WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
      }
+
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+     }
   }
 
   template<typename Stencil>
-  UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+  UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr <StructuredBlockForest>& bf,
                                                const Set<SUID> & requiredBlockSelectors,
                                                const Set<SUID> & incompatibleBlockSelectors,
                                                bool sendDirectlyFromGPU,
@@ -65,7 +66,6 @@ namespace communication {
      useLocalCommunication_(useLocalCommunication),
      bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
      bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-     parallelSectionManager_( -1 ),
     requiredBlockSelectors_( requiredBlockSelectors ),
     incompatibleBlockSelectors_( incompatibleBlockSelectors )
   {
@@ -75,11 +75,25 @@ namespace communication {
        WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
     }
+
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+     }
   }
 
+  template< typename Stencil >
+  UniformGPUScheme< Stencil >::~UniformGPUScheme()
+  {
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+     }
+  }
+
   template<typename Stencil>
-  void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream )
+  void UniformGPUScheme<Stencil>::startCommunication()
   {
      WALBERLA_ASSERT( !communicationInProgress_ )
     auto forest = blockForest_.lock();
@@ -99,9 +113,11 @@ namespace communication {
     for( auto it : headers_ )
        bufferSystemGPU_.sendBuffer( it.first ).clear();
 
+     if(timing_)
+        timingPool_->operator[]("UniformGPUScheme->startCommunication").start();
+
     // Start filling send buffers
     {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
        for( auto &iBlock : *forest )
        {
           auto senderBlock = dynamic_cast< Block * >( &iBlock );
@@ -124,7 +140,7 @@ namespace communication {
                 auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
                 for (auto& pi : packInfos_)
                 {
-                    pi->communicateLocal(*dir, senderBlock, receiverBlock, stream);
+                    pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]);
                 }
              }
             else
@@ -133,26 +149,29 @@ namespace communication {
 
                 for( auto &pi : packInfos_ )
                 {
-                    parallelSection.run([&](auto s) {
                       auto size = pi->size( *dir, senderBlock );
                       auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                       WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                       pi->pack( *dir, gpuDataPtr, senderBlock, s );
+                       pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] );
 
                       if( !sendFromGPU_ )
                       {
                          auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                          WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
-                          WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s ))
+                          WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] ))
                       }
-                    });
                 }
             }
          }
       }
    }
 
    // wait for packing to finish
-    WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) );
+    for (uint_t i = 0; i < Stencil::Q; ++i)
+    {
+       WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+    }
+
+    if(timing_)
+       timingPool_->operator[]("UniformGPUScheme->startCommunication").end();
 
    if( sendFromGPU_ )
       bufferSystemGPU_.sendAll();
@@ -164,15 +183,17 @@ namespace communication {
 
 
   template<typename Stencil>
-  void UniformGPUScheme<Stencil>::wait( gpuStream_t stream )
+  void UniformGPUScheme<Stencil>::wait()
   {
    WALBERLA_ASSERT( communicationInProgress_ )
 
    auto forest = blockForest_.lock();
 
+    if(timing_)
+       timingPool_->operator[]("UniformGPUScheme->wait").start();
+
    if( sendFromGPU_ )
    {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
      for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
      {
         recvInfo.buffer().clear();
@@ -185,16 +206,13 @@ namespace communication {
            auto size = pi->size( header.dir, block );
           auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
           WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-             parallelSection.run([&](auto s) {
-                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-             });
+             pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
        }
      }
    }
   else
   {
-        auto parallelSection = parallelSectionManager_.parallelSection( stream );
     for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
     {
        auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
@@ -211,17 +229,20 @@ namespace communication {
          auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
          WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
          WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-             parallelSection.run([&](auto s) {
             WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                gpuMemcpyHostToDevice, s ))
-                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-             });
+                                                gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]] ))
+                pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[stencil::inverseDir[header.dir]] );
        }
      }
   }
 
-     WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+     for (uint_t i = 0; i < Stencil::Q; ++i)
+     {
+        WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+     }
+
+     if(timing_)
+        timingPool_->operator[]("UniformGPUScheme->wait").end();
 
     communicationInProgress_ = false;
  }
@@ -309,21 +330,21 @@ namespace communication {
  }
 
  template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor()
 {
-    return [this, stream]() { communicate( stream ); };
+    return [this]() { communicate(); };
 }
 
 template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor()
 {
-    return [this, stream]() { startCommunication( stream ); };
+    return [this]() { startCommunication(); };
 }
 
 template< typename Stencil >
- std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(cudaStream_t stream)
+ std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor()
 {
-    return [this, stream]() { wait( stream ); };
+    return [this]() { wait(); };
 }
 
} // namespace communication