diff --git a/.clang-format b/.clang-format
index e5114ffd062d399b881d114f50e94b45988832dc..4ffb182c0fb8f021a6b1f31aee4b7f1b34608f63 100644
--- a/.clang-format
+++ b/.clang-format
@@ -62,16 +62,18 @@ IncludeCategories:
   - Regex:           '^"core/'
     Priority:        4
   - Regex:           '^"domain_decomposition/'
-    Priority:        6
+    Priority:        5
   - Regex:           '^"executiontree/'
-    Priority:        7
+    Priority:        6
   - Regex:           '^"fft/'
-    Priority:        8
+    Priority:        7
   - Regex:           '^"field/'
-    Priority:        9
+    Priority:        8
   - Regex:           '^"gather/'
-    Priority:        10
+    Priority:        9
   - Regex:           '^"geometry/'
+    Priority:        10
+  - Regex:           '^"gpu/'
     Priority:        11
   - Regex:           '^"gpu/'
     Priority:        12
@@ -97,16 +99,18 @@ IncludeCategories:
     Priority:        21
   - Regex:           '^"simd/'
     Priority:        22
-  - Regex:           '^"stencil/'
+  - Regex:           '^"sqlite/'
     Priority:        23
-  - Regex:           '^"timeloop/'
+  - Regex:           '^"stencil/'
     Priority:        24
-  - Regex:           '^"vtk/'
+  - Regex:           '^"timeloop/'
     Priority:        25
-  - Regex:           '^<boost/'
+  - Regex:           '^"vtk/'
     Priority:        26
-  - Regex:           '^<'
+  - Regex:           '^<boost/'
     Priority:        27
+  - Regex:           '^<'
+    Priority:        28
 IndentCaseLabels: false
 IndentPPDirectives: AfterHash
 IndentWidth: 3
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 47ac8e0266e3ae7ddf305a22c70fb22406304238..1ee46ddc0af7fd8dfd8e859d18ce0bce05e4184d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -57,6 +57,7 @@ stages:
         -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN
         -DWALBERLA_STL_BOUNDS_CHECKS=$WALBERLA_STL_BOUNDS_CHECKS
         -DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL
+        -DCMAKE_CUDA_ARCHITECTURES=60
       - cmake . -LA
       - make -j $NUM_BUILD_CORES -l $NUM_CORES
       - ctest -LE $CTEST_EXCLUDE_LABELS -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES -T Test
diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index f418d434ca38902dc79be8f9b9a4207f9fb90de6..cffa8f427d9fc06e9bdc96090553374eb3e73ba3 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -5,9 +5,11 @@ add_subdirectory( DEM )
 add_subdirectory( MeshDistance )
 add_subdirectory( CouetteFlow )
 add_subdirectory( FreeSurfaceAdvection )
+add_subdirectory( FluidizedBed )
 add_subdirectory( FluidParticleCoupling )
 add_subdirectory( FluidParticleCouplingWithLoadBalancing )
 add_subdirectory( ForcesOnSphereNearPlaneInShearFlow )
+add_subdirectory(Percolation)
 add_subdirectory( GranularGas )
 add_subdirectory( IntegratorAccuracy )
 add_subdirectory( LennardJones )
diff --git a/apps/benchmarks/FluidParticleCoupling/CMakeLists.txt b/apps/benchmarks/FluidParticleCoupling/CMakeLists.txt
index 898352998666621d6bdfec7f0f07f5c6b4de3724..34ffaca075f6ac90be4f9f077f779ec21f7e6603 100644
--- a/apps/benchmarks/FluidParticleCoupling/CMakeLists.txt
+++ b/apps/benchmarks/FluidParticleCoupling/CMakeLists.txt
@@ -67,7 +67,5 @@ waLBerla_add_executable ( NAME ObliqueWetCollision FILES ObliqueWetCollision.cpp
 
 endif()
 
-
-
 waLBerla_add_executable ( NAME ObliqueDryCollision FILES ObliqueDryCollision.cpp
-      DEPENDS blockforest core mesa_pd postprocessing )
\ No newline at end of file
+      DEPENDS blockforest core mesa_pd postprocessing )
diff --git a/apps/benchmarks/FluidizedBed/CMakeLists.txt b/apps/benchmarks/FluidizedBed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9c33fb523801a7f55935e99661496e38142b412a
--- /dev/null
+++ b/apps/benchmarks/FluidizedBed/CMakeLists.txt
@@ -0,0 +1,6 @@
+waLBerla_link_files_to_builddir("*.prm")
+# GPU benchmark: needs GPU support and codegen, plus CMAKE_CUDA_ARCHITECTURES >= 60 unless building with HIP
+if (WALBERLA_BUILD_WITH_GPU_SUPPORT AND WALBERLA_BUILD_WITH_CODEGEN AND (CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60 OR WALBERLA_BUILD_WITH_HIP))
+    waLBerla_add_executable(NAME FluidizedBed_PSM_GPU FILES FluidizedBedGPU.cpp
+            DEPENDS blockforest boundary core gpu domain_decomposition field lbm lbm_mesapd_coupling mesa_pd timeloop vtk PSMCodegenPython_srt_sc1)
+endif ()
diff --git a/apps/benchmarks/FluidizedBed/FluidizedBedGPU.cpp b/apps/benchmarks/FluidizedBed/FluidizedBedGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf454243e5c71ce9b8cde90dd73fa92b47d44bff
--- /dev/null
+++ b/apps/benchmarks/FluidizedBed/FluidizedBedGPU.cpp
@@ -0,0 +1,841 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FluidizedBedGPU.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//! \brief Modification of showcases/FluidizedBed/FluidizedBedPSM.cpp
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/grid_generator/SCIterator.h"
+#include "core/logging/all.h"
+#include "core/math/all.h"
+#include "core/mpi/Broadcast.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/vtk/all.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm/PerformanceLogger.h"
+#include "lbm/vtk/all.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/AddForceOnParticlesKernel.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/LinkedCells.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/shape/HalfSpace.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDataHandling.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/AssocToBlock.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/InsertParticleIntoLinkedCells.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "vtk/all.h"
+
+#include "InitializeDomainForPSM.h"
+#include "PSMPackInfo.h"
+#include "PSMSweepSplit.h"
+#include "PSM_Density.h"
+#include "PSM_InfoHeader.h"
+#include "PSM_MacroGetter.h"
+#include "PSM_NoSlip.h"
+#include "PSM_UBB.h"
+
+namespace fluidized_bed
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+
+using namespace lbm_mesapd_coupling::psm::gpu;
+typedef pystencils::PSMPackInfo PackInfo_T;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag("Fluid");
+const FlagUID NoSlip_Flag("NoSlip");
+const FlagUID Inflow_Flag("Inflow");
+const FlagUID Outflow_Flag("Outflow");
+
+void createPlane(const shared_ptr< mesa_pd::data::ParticleStorage >& ps,
+                 const shared_ptr< mesa_pd::data::ShapeStorage >& ss, Vector3< real_t > position,
+                 Vector3< real_t > normal)
+{
+   mesa_pd::data::Particle&& plane = *ps->create(true); // new entry in ps that represents the bounding plane
+   plane.setPosition(position);
+   plane.setInteractionRadius(std::numeric_limits< real_t >::infinity());
+   plane.setShapeID(ss->create< mesa_pd::data::HalfSpace >(normal)); // half-space shape built from the normal
+   plane.setOwner(mpi::MPIManager::instance()->rank());              // owner is this process' MPI rank
+   plane.setType(0);
+   mesa_pd::data::particle_flags::set(plane.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(plane.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+}
+
+void createPlaneSetup(const shared_ptr< mesa_pd::data::ParticleStorage >& ps,
+                      const shared_ptr< mesa_pd::data::ShapeStorage >& ss, const math::AABB& simulationDomain,
+                      bool periodicInX, bool periodicInY, real_t offsetAtInflow, real_t offsetAtOutflow)
+{
+   // Bounds the particles by planes: always at bottom and top (z), plus side walls in every non-periodic direction.
+   // NOTE(review): offsetAtOutflow is added to maxCorner, so a positive value moves the top plane outward - confirm.
+   const auto lowCorner = simulationDomain.minCorner();
+   const auto topCorner = simulationDomain.maxCorner();
+   createPlane(ps, ss, lowCorner + Vector3< real_t >(0, 0, offsetAtInflow), Vector3< real_t >(0, 0, 1));
+   createPlane(ps, ss, topCorner + Vector3< real_t >(0, 0, offsetAtOutflow), Vector3< real_t >(0, 0, -1));
+   if (!periodicInX)
+   {
+      createPlane(ps, ss, lowCorner, Vector3< real_t >(1, 0, 0));
+      createPlane(ps, ss, topCorner, Vector3< real_t >(-1, 0, 0));
+   }
+   if (!periodicInY)
+   {
+      createPlane(ps, ss, lowCorner, Vector3< real_t >(0, 1, 0));
+      createPlane(ps, ss, topCorner, Vector3< real_t >(0, -1, 0));
+   }
+}
+
+//! Particle statistics; values are per-process until allReduce() combines them over all MPI processes.
+struct ParticleInfo
+{
+   real_t averageVelocity = 0_r; // sum of velocity magnitudes before allReduce(), mean afterwards
+   real_t maximumVelocity = 0_r;
+   uint_t numParticles    = 0;
+   real_t maximumHeight   = 0_r;
+   real_t particleVolume  = 0_r; // summed particle volume
+   real_t heightOfMass    = 0_r; // volume-weighted height sum before allReduce(), normalized afterwards
+
+   void allReduce()
+   {
+      walberla::mpi::allReduceInplace(numParticles, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumVelocity, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(maximumHeight, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(particleVolume, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(heightOfMass, walberla::mpi::SUM);
+      if (numParticles > 0) averageVelocity /= real_c(numParticles); // guards avoid 0/0 = NaN without particles
+      if (particleVolume > 0_r) heightOfMass /= particleVolume;
+   }
+};
+
+std::ostream& operator<<(std::ostream& os, ParticleInfo const& m)
+{
+   os << "Particle Info: uAvg = " << m.averageVelocity << ", uMax = " << m.maximumVelocity;
+   os << ", numParticles = " << m.numParticles << ", zMax = " << m.maximumHeight << ", Vp = " << m.particleVolume;
+   return os << ", zMass = " << m.heightOfMass; // everything on one line, no trailing newline
+}
+
+//! Accumulates the statistics of all particles reachable via \p ac and combines them with ParticleInfo::allReduce().
+template< typename Accessor_T >
+ParticleInfo evaluateParticleInfo(const Accessor_T& ac)
+{
+   static_assert(std::is_base_of< mesa_pd::data::IAccessor, Accessor_T >::value, "Provide a valid accessor");
+
+   ParticleInfo result;
+   for (uint_t idx = 0; idx < ac.size(); ++idx)
+   {
+      // particles flagged GHOST or GLOBAL do not enter the statistics
+      const bool isGhost  = isSet(ac.getFlags(idx), mesa_pd::data::particle_flags::GHOST);
+      const bool isGlobal = isSet(ac.getFlags(idx), mesa_pd::data::particle_flags::GLOBAL);
+      if (isGhost || isGlobal) continue;
+      const real_t speed  = ac.getLinearVelocity(idx).length();
+      const real_t volume = ac.getShape(idx)->getVolume();
+      const real_t zPos   = ac.getPosition(idx)[2];
+      ++result.numParticles;
+      result.averageVelocity += speed; // plain sum here, allReduce() divides by the particle count
+      result.particleVolume += volume;
+      result.heightOfMass += volume * zPos; // volume-weighted height, allReduce() divides by the total volume
+      result.maximumVelocity = std::max(result.maximumVelocity, speed);
+      result.maximumHeight   = std::max(result.maximumHeight, zPos);
+   }
+   result.allReduce();
+   return result;
+}
+
+//! Fluid statistics; values are per-process until allReduce() combines them over all MPI processes.
+struct FluidInfo
+{
+   uint_t numFluidCells   = 0;
+   real_t averageVelocity = 0_r; // sum of velocity magnitudes before allReduce(), mean afterwards
+   real_t maximumVelocity = 0_r;
+   real_t averageDensity  = 0_r; // sum of densities before allReduce(), mean afterwards
+   real_t maximumDensity  = 0_r;
+
+   void allReduce()
+   {
+      walberla::mpi::allReduceInplace(numFluidCells, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumVelocity, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(averageDensity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumDensity, walberla::mpi::MAX);
+      if (numFluidCells == 0) return; // nothing to average; avoids 0/0 = NaN
+      averageVelocity /= real_c(numFluidCells);
+      averageDensity /= real_c(numFluidCells);
+   }
+};
+
+std::ostream& operator<<(std::ostream& os, FluidInfo const& m)
+{
+   os << "Fluid Info: numFluidCells = " << m.numFluidCells << ", uAvg = " << m.averageVelocity;
+   os << ", uMax = " << m.maximumVelocity << ", densityAvg = " << m.averageDensity;
+   return os << ", densityMax = " << m.maximumDensity; // everything on one line, no trailing newline
+}
+
+FluidInfo evaluateFluidInfo(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID& densityFieldID,
+                            const BlockDataID& velocityFieldID)
+{
+   // Gathers cell count plus velocity-magnitude and density sums/maxima over all blocks, then calls allReduce().
+   // NOTE(review): the accumulation into `result` is not thread-safe if the cell macro runs in parallel - confirm.
+   FluidInfo result;
+   for (auto& currentBlock : *blocks)
+   {
+      auto rhoField = currentBlock.getData< DensityField_T >(densityFieldID);
+      auto uField   = currentBlock.getData< VelocityField_T >(velocityFieldID);
+      WALBERLA_FOR_ALL_CELLS_XYZ(
+         rhoField, const real_t cellDensity = rhoField->get(x, y, z);
+         const Vector3< real_t > cellVel(uField->get(x, y, z, 0), uField->get(x, y, z, 1), uField->get(x, y, z, 2));
+         const real_t speed = cellVel.length(); ++result.numFluidCells;
+         result.averageVelocity += speed; result.maximumVelocity = std::max(result.maximumVelocity, speed);
+         result.averageDensity += cellDensity; result.maximumDensity = std::max(result.maximumDensity, cellDensity);)
+   }
+   result.allReduce();
+   return result;
+}
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Basic simulation of a fluidization setup
+ *
+ * Initially, the mono-sized spheres are created on a structured grid inside the domain.
+ * The domain is either periodic or bounded by walls in the horizontal directions (x and y).
+ * In z-direction, a constant inflow from below is provided
+ * and a pressure boundary condition is set at the top, resembling an outflow boundary.
+ *
+ * The simulation is run for the given number of seconds (runtime).
+ *
+ * All parameters should be set via the input file.
+ *
+ * For the overall algorithm and the different model parameters, see
+ * Rettinger, Rüde - An efficient four-way coupled lattice Boltzmann - discrete element method for
+ * fully resolved simulations of particle-laden flows (2020, preprint: https://arxiv.org/abs/2003.01490)
+ *
+ */
+//*******************************************************************************************************************
+int main(int argc, char** argv)
+{
+   Environment env(argc, argv);
+   gpu::selectDeviceBasedOnMpiRank();
+
+   auto cfgFile = env.config();
+   if (!cfgFile) { WALBERLA_ABORT("Usage: " << argv[0] << " path-to-configuration-file \n"); }
+
+   WALBERLA_LOG_INFO_ON_ROOT("waLBerla revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8));
+   WALBERLA_LOG_INFO_ON_ROOT("compiler flags: " << std::string(WALBERLA_COMPILER_FLAGS));
+   WALBERLA_LOG_INFO_ON_ROOT("build machine: " << std::string(WALBERLA_BUILD_MACHINE));
+   WALBERLA_LOG_INFO_ON_ROOT(*cfgFile);
+
+   // read all parameters from the config file
+
+   Config::BlockHandle physicalSetup         = cfgFile->getBlock("PhysicalSetup");
+   const real_t xSize_SI                     = physicalSetup.getParameter< real_t >("xSize");
+   const real_t ySize_SI                     = physicalSetup.getParameter< real_t >("ySize");
+   const real_t zSize_SI                     = physicalSetup.getParameter< real_t >("zSize");
+   const bool periodicInX                    = physicalSetup.getParameter< bool >("periodicInX");
+   const bool periodicInY                    = physicalSetup.getParameter< bool >("periodicInY");
+   const real_t runtime_SI                   = physicalSetup.getParameter< real_t >("runtime");
+   const real_t uInflow_SI                   = physicalSetup.getParameter< real_t >("uInflow");
+   const real_t gravitationalAcceleration_SI = physicalSetup.getParameter< real_t >("gravitationalAcceleration");
+   const real_t kinematicViscosityFluid_SI   = physicalSetup.getParameter< real_t >("kinematicViscosityFluid");
+   const real_t densityFluid_SI              = physicalSetup.getParameter< real_t >("densityFluid");
+   const real_t particleDiameter_SI          = physicalSetup.getParameter< real_t >("particleDiameter");
+   const real_t densityParticle_SI           = physicalSetup.getParameter< real_t >("densityParticle");
+   const real_t dynamicFrictionCoefficient   = physicalSetup.getParameter< real_t >("dynamicFrictionCoefficient");
+   const real_t coefficientOfRestitution     = physicalSetup.getParameter< real_t >("coefficientOfRestitution");
+   const real_t collisionTimeFactor          = physicalSetup.getParameter< real_t >("collisionTimeFactor");
+   const real_t particleGenerationSpacing_SI = physicalSetup.getParameter< real_t >("particleGenerationSpacing");
+
+   Config::BlockHandle numericalSetup = cfgFile->getBlock("NumericalSetup");
+   const real_t dx_SI                 = numericalSetup.getParameter< real_t >("dx");
+   const real_t uInflow               = numericalSetup.getParameter< real_t >("uInflow");
+   const uint_t numXBlocks            = numericalSetup.getParameter< uint_t >("numXBlocks");
+   const uint_t numYBlocks            = numericalSetup.getParameter< uint_t >("numYBlocks");
+   const uint_t numZBlocks            = numericalSetup.getParameter< uint_t >("numZBlocks");
+   WALBERLA_CHECK_EQUAL(numXBlocks * numYBlocks * numZBlocks, uint_t(MPIManager::instance()->numProcesses()),
+                        "When using GPUs, the number of blocks ("
+                           << numXBlocks * numYBlocks * numZBlocks << ") has to match the number of MPI processes ("
+                           << uint_t(MPIManager::instance()->numProcesses()) << ")");
+   if ((periodicInX && numXBlocks == 1) || (periodicInY && numYBlocks == 1))
+   {
+      WALBERLA_ABORT("The number of blocks must be greater than 1 in periodic dimensions.")
+   }
+   const bool useLubricationForces        = numericalSetup.getParameter< bool >("useLubricationForces");
+   const uint_t numberOfParticleSubCycles = numericalSetup.getParameter< uint_t >("numberOfParticleSubCycles");
+   const Vector3< uint_t > particleSubBlockSize =
+      numericalSetup.getParameter< Vector3< uint_t > >("particleSubBlockSize");
+   const real_t linkedCellWidthRation = numericalSetup.getParameter< real_t >("linkedCellWidthRation");
+   const bool particleBarriers        = numericalSetup.getParameter< bool >("particleBarriers");
+
+   Config::BlockHandle outputSetup      = cfgFile->getBlock("Output");
+   const real_t infoSpacing_SI          = outputSetup.getParameter< real_t >("infoSpacing");
+   const real_t vtkSpacingParticles_SI  = outputSetup.getParameter< real_t >("vtkSpacingParticles");
+   const real_t vtkSpacingFluid_SI      = outputSetup.getParameter< real_t >("vtkSpacingFluid");
+   const std::string vtkFolder          = outputSetup.getParameter< std::string >("vtkFolder");
+   const uint_t performanceLogFrequency = outputSetup.getParameter< uint_t >("performanceLogFrequency");
+
+   // convert SI units to simulation (LBM) units and check setup
+
+   Vector3< uint_t > domainSize(uint_c(std::ceil(xSize_SI / dx_SI)), uint_c(std::ceil(ySize_SI / dx_SI)),
+                                uint_c(std::ceil(zSize_SI / dx_SI)));
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[0]) * dx_SI, xSize_SI, "domain size in x is not divisible by given dx");
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[1]) * dx_SI, ySize_SI, "domain size in y is not divisible by given dx");
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[2]) * dx_SI, zSize_SI, "domain size in z is not divisible by given dx");
+
+   Vector3< uint_t > cellsPerBlockPerDirection(domainSize[0] / numXBlocks, domainSize[1] / numYBlocks,
+                                               domainSize[2] / numZBlocks);
+
+   WALBERLA_CHECK_EQUAL(domainSize[0], cellsPerBlockPerDirection[0] * numXBlocks,
+                        "number of cells in x of " << domainSize[0]
+                                                   << " is not divisible by given number of blocks in x direction");
+   WALBERLA_CHECK_EQUAL(domainSize[1], cellsPerBlockPerDirection[1] * numYBlocks,
+                        "number of cells in y of " << domainSize[1]
+                                                   << " is not divisible by given number of blocks in y direction");
+   WALBERLA_CHECK_EQUAL(domainSize[2], cellsPerBlockPerDirection[2] * numZBlocks,
+                        "number of cells in z of " << domainSize[2]
+                                                   << " is not divisible by given number of blocks in z direction");
+
+   WALBERLA_CHECK_GREATER(
+      particleDiameter_SI / dx_SI, 5_r,
+      "Your numerical resolution is below 5 cells per diameter and thus too small for such simulations!");
+
+   const real_t densityRatio           = densityParticle_SI / densityFluid_SI;
+   const real_t ReynoldsNumberParticle = uInflow_SI * particleDiameter_SI / kinematicViscosityFluid_SI;
+   const real_t GalileiNumber = std::sqrt((densityRatio - 1_r) * particleDiameter_SI * gravitationalAcceleration_SI) *
+                                particleDiameter_SI / kinematicViscosityFluid_SI;
+
+   // in simulation units: dt = 1, dx = 1, densityFluid = 1
+
+   const real_t dt_SI                     = uInflow / uInflow_SI * dx_SI;
+   const real_t diameter                  = particleDiameter_SI / dx_SI;
+   const real_t particleGenerationSpacing = particleGenerationSpacing_SI / dx_SI;
+   const real_t viscosity                 = kinematicViscosityFluid_SI * dt_SI / (dx_SI * dx_SI);
+   const real_t omega                     = lbm::collision_model::omegaFromViscosity(viscosity);
+   const real_t gravitationalAcceleration = gravitationalAcceleration_SI * dt_SI * dt_SI / dx_SI;
+   const real_t particleVolume            = math::pi / 6_r * diameter * diameter * diameter;
+
+   const real_t densityFluid    = real_t(1);
+   const real_t densityParticle = densityRatio;
+   const real_t dx              = real_t(1);
+
+   const uint_t numTimeSteps        = uint_c(std::ceil(runtime_SI / dt_SI));
+   const uint_t infoSpacing         = uint_c(std::ceil(infoSpacing_SI / dt_SI));
+   const uint_t vtkSpacingParticles = uint_c(std::ceil(vtkSpacingParticles_SI / dt_SI));
+   const uint_t vtkSpacingFluid     = uint_c(std::ceil(vtkSpacingFluid_SI / dt_SI));
+
+   const Vector3< real_t > inflowVec(0_r, 0_r, uInflow);
+
+   const real_t poissonsRatio         = real_t(0.22);
+   const real_t kappa                 = real_t(2) * (real_t(1) - poissonsRatio) / (real_t(2) - poissonsRatio);
+   const real_t particleCollisionTime = collisionTimeFactor * diameter;
+
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation setup:");
+   WALBERLA_LOG_INFO_ON_ROOT(" - particles: diameter = " << diameter << ", densityRatio = " << densityRatio);
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: kin. visc = " << viscosity << ", relaxation rate = " << omega);
+   WALBERLA_LOG_INFO_ON_ROOT(" - grav. acceleration = " << gravitationalAcceleration);
+   WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileiNumber);
+   WALBERLA_LOG_INFO_ON_ROOT(" - particle Reynolds number = " << ReynoldsNumberParticle);
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
+   WALBERLA_LOG_INFO_ON_ROOT(" - cells per blocks per direction = " << cellsPerBlockPerDirection);
+   WALBERLA_LOG_INFO_ON_ROOT(" - dx = " << dx_SI << " m");
+   WALBERLA_LOG_INFO_ON_ROOT(" - dt = " << dt_SI << " s");
+   WALBERLA_LOG_INFO_ON_ROOT(" - total time steps = " << numTimeSteps);
+   WALBERLA_LOG_INFO_ON_ROOT(" - particle generation spacing = " << particleGenerationSpacing);
+   WALBERLA_LOG_INFO_ON_ROOT(" - info spacing = " << infoSpacing);
+   WALBERLA_LOG_INFO_ON_ROOT(" - vtk spacing particles = " << vtkSpacingParticles
+                                                           << ", fluid slice = " << vtkSpacingFluid);
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const bool periodicInZ                     = false;
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid(
+      numXBlocks, numYBlocks, numZBlocks, cellsPerBlockPerDirection[0], cellsPerBlockPerDirection[1],
+      cellsPerBlockPerDirection[2], dx, 0, false, false, periodicInX, periodicInY, periodicInZ, // periodicity
+      false);
+
+   auto simulationDomain = blocks->getDomain();
+
+   //////////////////
+   // RPD COUPLING //
+   //////////////////
+
+   auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+
+   // init data structures
+   auto ps                  = walberla::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = walberla::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+
+   // prevent particles from interfering with inflow and outflow by putting the bounding planes slightly in front
+   const real_t planeOffsetFromInflow  = dx;
+   const real_t planeOffsetFromOutflow = dx;
+   createPlaneSetup(ps, ss, simulationDomain, periodicInX, periodicInY, planeOffsetFromInflow, planeOffsetFromOutflow);
+
+   auto sphereShape = ss->create< mesa_pd::data::Sphere >(diameter * real_t(0.5));
+   ss->shapes[sphereShape]->updateMassAndInertia(densityParticle);
+
+   // create spheres
+   auto generationDomain = simulationDomain.getExtended(-particleGenerationSpacing * 0.5_r);
+   for (auto pt : grid_generator::SCGrid(generationDomain, generationDomain.center(), particleGenerationSpacing))
+   {
+      if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), pt))
+      {
+         mesa_pd::data::Particle&& p = *ps->create();
+         p.setPosition(pt);
+         p.setInteractionRadius(diameter * real_t(0.5));
+         p.setOwner(mpi::MPIManager::instance()->rank());
+         p.setShapeID(sphereShape);
+         p.setType(1);
+         p.setLinearVelocity(0.1_r * Vector3< real_t >(math::realRandom(
+                                        -uInflow, uInflow))); // set small initial velocity to break symmetries
+      }
+   }
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // add PDF field
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID pdfFieldGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
+
+   BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "Density", real_t(0), field::fzyx);
+   BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "Velocity", real_t(0), field::fzyx);
+
+   BlockDataID BFieldID =
+      field::addToStorage< lbm_mesapd_coupling::psm::gpu::BField_T >(blocks, "B field", 0, field::fzyx, 1);
+
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   // set up RPD functionality
+   std::function< void(void) > syncCall = [&ps, &rpdDomain]() {
+      // keep overlap for lubrication
+      const real_t overlap = real_t(1.5);
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+   };
+
+   syncCall();
+
+   real_t timeStepSizeRPD = real_t(1) / real_t(numberOfParticleSubCycles);
+   mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(timeStepSizeRPD);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeRPD);
+   mesa_pd::kernel::LinearSpringDashpot collisionResponse(2);
+   collisionResponse.setFrictionCoefficientDynamic(0, 1, dynamicFrictionCoefficient);
+   collisionResponse.setFrictionCoefficientDynamic(1, 1, dynamicFrictionCoefficient);
+   real_t massSphere       = densityParticle * particleVolume;
+   real_t meffSpherePlane  = massSphere;
+   real_t meffSphereSphere = massSphere * massSphere / (real_t(2) * massSphere);
+   collisionResponse.setStiffnessAndDamping(0, 1, coefficientOfRestitution, particleCollisionTime, kappa,
+                                            meffSpherePlane);
+   collisionResponse.setStiffnessAndDamping(1, 1, coefficientOfRestitution, particleCollisionTime, kappa,
+                                            meffSphereSphere);
+   mesa_pd::kernel::AssocToBlock assoc(blocks->getBlockForestPointer());
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+
+   // set up coupling functionality
+   Vector3< real_t > gravitationalForce(real_t(0), real_t(0),
+                                        -(densityParticle - densityFluid) * gravitationalAcceleration * particleVolume);
+   lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(
+      viscosity, [](real_t r) { return (real_t(0.001 + real_t(0.00007) * r)) * r; });
+
+   // assemble boundary block string
+   std::string boundariesBlockString = " Boundaries"
+                                       "{"
+                                       "Border { direction T;    walldistance -1;  flag Outflow; }"
+                                       "Border { direction B;    walldistance -1;  flag Inflow; }";
+
+   if (!periodicInX)
+   {
+      boundariesBlockString += "Border { direction W;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction E;    walldistance -1;  flag NoSlip; }";
+   }
+
+   if (!periodicInY)
+   {
+      boundariesBlockString += "Border { direction S;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction N;    walldistance -1;  flag NoSlip; }";
+   }
+
+   boundariesBlockString += "}";
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream boundariesFile("boundaries.prm");
+      boundariesFile << boundariesBlockString;
+      boundariesFile.close();
+   }
+   WALBERLA_MPI_BARRIER()
+
+   auto boundariesCfgFile = Config();
+   boundariesCfgFile.readParameterFile("boundaries.prm");
+   auto boundariesConfig = boundariesCfgFile.getBlock("Boundaries");
+
+   // map boundaries into the LBM simulation
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, Fluid_Flag);
+   lbm::PSM_NoSlip noSlip(blocks, pdfFieldGPUID);
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, NoSlip_Flag, Fluid_Flag);
+   lbm::PSM_UBB ubb(blocks, pdfFieldGPUID, inflowVec[0], inflowVec[1], inflowVec[2]);
+   ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Inflow_Flag, Fluid_Flag);
+   lbm::PSM_Density density_bc(blocks, pdfFieldGPUID, real_t(1));
+   density_bc.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Outflow_Flag, Fluid_Flag);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // map particles into the LBM simulation
+   // note: planes are not mapped and are thus only visible to the particles, not to the fluid
+   // instead, the respective boundary conditions for the fluid are explicitly set, see the boundary handling
+   ParticleAndVolumeFractionSoA_T< 1 > particleAndVolumeFractionSoA(blocks, omega);
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
+                                            particleAndVolumeFractionSoA, particleSubBlockSize);
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      psmSweepCollection.particleMappingSweep(&(*blockIt));
+   }
+
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt));
+      pdfSetter(&(*blockIt));
+   }
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, true, false);
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   // create the timeloop
+   SweepTimeloop commTimeloop(blocks->getBlockStorage(), numTimeSteps);
+   SweepTimeloop timeloop(blocks->getBlockStorage(), numTimeSteps);
+
+   timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+   pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, real_t(0.0), real_t(0.0),
+                                           real_t(0.0));
+   // vtk output
+   if (vtkSpacingParticles != uint_t(0))
+   {
+      // sphere
+      auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
+      // limit output to process-local spheres
+      particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {
+         return pIt->getShapeID() == sphereShape &&
+                !(mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST));
+      });
+      auto particleVtkWriter =
+         vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacingParticles, vtkFolder);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+   }
+
+   if (vtkSpacingFluid != uint_t(0))
+   {
+      // velocity field, only a slice
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid", vtkSpacingFluid, 0, false, vtkFolder);
+
+      pdfFieldVTK->addBeforeFunction(communication);
+
+      pdfFieldVTK->addBeforeFunction([&]() {
+         gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldGPUID);
+         gpu::fieldCpy< GhostLayerField< real_t, 1 >, BFieldGPU_T >(blocks, BFieldID,
+                                                                    particleAndVolumeFractionSoA.BFieldID);
+         for (auto& block : *blocks)
+            getterSweep(&block);
+      });
+
+      AABB sliceAABB(real_t(0), real_c(domainSize[1]) * real_t(0.5) - real_t(1), real_t(0), real_c(domainSize[0]),
+                     real_c(domainSize[1]) * real_t(0.5) + real_t(1), real_c(domainSize[2]));
+      vtk::AABBCellFilter aabbSliceFilter(sliceAABB);
+
+      field::FlagFieldCellFilter< FlagField_T > fluidFilter(flagFieldID);
+      fluidFilter.addFlag(Fluid_Flag);
+
+      vtk::ChainedFilter combinedSliceFilter;
+      combinedSliceFilter.addFilter(fluidFilter);
+      combinedSliceFilter.addFilter(aabbSliceFilter);
+
+      pdfFieldVTK->addCellInclusionFilter(combinedSliceFilter);
+
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< DensityField_T > >(densityFieldID, "Density"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(BFieldID, "Fraction mapping field B"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   if (vtkSpacingFluid != uint_t(0) || vtkSpacingParticles != uint_t(0))
+   {
+      vtk::writeDomainDecomposition(blocks, "domain_decomposition", vtkFolder);
+   }
+
+   // add performance logging
+   const lbm::PerformanceLogger< FlagField_T > performanceLogger(blocks, flagFieldID, Fluid_Flag,
+                                                                 performanceLogFrequency);
+   timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluate performance logging");
+
+   // add LBM communication function and boundary handling sweep
+   timeloop.add() << Sweep(deviceSyncWrapper(ubb.getSweep()), "Boundary Handling (UBB)");
+   timeloop.add() << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
+   timeloop.add() << Sweep(deviceSyncWrapper(noSlip.getSweep()), "Boundary Handling (NoSlip)");
+
+   // stream + collide LBM step
+   pystencils::PSMSweepSplit PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                      particleAndVolumeFractionSoA.particleForcesFieldID,
+                                      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldGPUID,
+                                      real_t(0.0), real_t(0.0), real_t(0.0), omega);
+   addPSMSweepsToTimeloops(commTimeloop, timeloop, com, psmSweepCollection, PSMSweep);
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+   const bool useOpenMP = true;
+
+   real_t linkedCellWidth = linkedCellWidthRation * diameter;
+   mesa_pd::data::LinkedCells linkedCells(rpdDomain->getUnionOfLocalAABBs().getExtended(linkedCellWidth),
+                                          linkedCellWidth);
+   mesa_pd::kernel::InsertParticleIntoLinkedCells ipilc;
+
+   // time loop
+   for (uint_t timeStep = 0; timeStep < numTimeSteps; ++timeStep)
+   {
+      // perform a single simulation step -> this contains LBM and setting of the hydrodynamic interactions
+      commTimeloop.singleStep(timeloopTiming);
+      timeloop.singleStep(timeloopTiming);
+
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD forEachParticle assoc"].start();
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, assoc, *accessor);
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD forEachParticle assoc"].end();
+      timeloopTiming["RPD reduceProperty HydrodynamicForceTorqueNotification"].start();
+      reduceProperty.operator()< mesa_pd::HydrodynamicForceTorqueNotification >(*ps);
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD reduceProperty HydrodynamicForceTorqueNotification"].end();
+
+      if (timeStep == 0)
+      {
+         lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel
+            initializeHydrodynamicForceTorqueForAveragingKernel;
+         timeloopTiming["RPD forEachParticle initializeHydrodynamicForceTorqueForAveragingKernel"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                             initializeHydrodynamicForceTorqueForAveragingKernel, *accessor);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticle initializeHydrodynamicForceTorqueForAveragingKernel"].end();
+      }
+      timeloopTiming["RPD forEachParticle averageHydrodynamicForceTorque"].start();
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, averageHydrodynamicForceTorque,
+                          *accessor);
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD forEachParticle averageHydrodynamicForceTorque"].end();
+
+      for (auto subCycle = uint_t(0); subCycle < numberOfParticleSubCycles; ++subCycle)
+      {
+         timeloopTiming["RPD forEachParticle vvIntegratorPreForce"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticle vvIntegratorPreForce"].end();
+         timeloopTiming["RPD syncCall"].start();
+         syncCall();
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD syncCall"].end();
+
+         timeloopTiming["RPD linkedCells.clear"].start();
+         linkedCells.clear();
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD linkedCells.clear"].end();
+         timeloopTiming["RPD forEachParticle ipilc"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, ipilc, *accessor, linkedCells);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticle ipilc"].end();
+
+         if (useLubricationForces)
+         {
+            // lubrication correction
+            timeloopTiming["RPD forEachParticlePairHalf lubricationCorrectionKernel"].start();
+            linkedCells.forEachParticlePairHalf(
+               useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+               [&lubricationCorrectionKernel, &rpdDomain](const size_t idx1, const size_t idx2, auto& ac) {
+                  mesa_pd::collision_detection::AnalyticContactDetection acd;
+                  acd.getContactThreshold() = lubricationCorrectionKernel.getNormalCutOffDistance();
+                  mesa_pd::kernel::DoubleCast double_cast;
+                  mesa_pd::mpi::ContactFilter contact_filter;
+                  if (double_cast(idx1, idx2, ac, acd, ac))
+                  {
+                     if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                     {
+                        double_cast(acd.getIdx1(), acd.getIdx2(), ac, lubricationCorrectionKernel, ac,
+                                    acd.getContactNormal(), acd.getPenetrationDepth());
+                     }
+                  }
+               },
+               *accessor);
+            if (particleBarriers) WALBERLA_MPI_BARRIER();
+            timeloopTiming["RPD forEachParticlePairHalf lubricationCorrectionKernel"].end();
+         }
+
+         // collision response
+         timeloopTiming["RPD forEachParticlePairHalf collisionResponse"].start();
+         linkedCells.forEachParticlePairHalf(
+            useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+            [&collisionResponse, &rpdDomain, timeStepSizeRPD](const size_t idx1, const size_t idx2, auto& ac) {
+               mesa_pd::collision_detection::AnalyticContactDetection acd;
+               mesa_pd::kernel::DoubleCast double_cast;
+               mesa_pd::mpi::ContactFilter contact_filter;
+               if (double_cast(idx1, idx2, ac, acd, ac))
+               {
+                  if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                  {
+                     collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(),
+                                       acd.getPenetrationDepth(), timeStepSizeRPD);
+                  }
+               }
+            },
+            *accessor);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticlePairHalf collisionResponse"].end();
+
+         timeloopTiming["RPD reduceProperty reduceAndSwapContactHistory"].start();
+         reduceAndSwapContactHistory(*ps);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD reduceProperty reduceAndSwapContactHistory"].end();
+
+         // add hydrodynamic force
+         lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+         timeloopTiming["RPD forEachParticle addHydrodynamicInteraction + addGravitationalForce"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addHydrodynamicInteraction,
+                             *accessor);
+
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticle addHydrodynamicInteraction + addGravitationalForce"].end();
+
+         timeloopTiming["RPD reduceProperty ForceTorqueNotification"].start();
+         reduceProperty.operator()< mesa_pd::ForceTorqueNotification >(*ps);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD reduceProperty ForceTorqueNotification"].end();
+
+         timeloopTiming["RPD forEachParticle vvIntegratorPostForce"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["RPD forEachParticle vvIntegratorPostForce"].end();
+      }
+
+      timeloopTiming["RPD syncCall"].start();
+      syncCall();
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD syncCall"].end();
+
+      timeloopTiming["RPD forEachParticle resetHydrodynamicForceTorque"].start();
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor);
+      if (particleBarriers) WALBERLA_MPI_BARRIER();
+      timeloopTiming["RPD forEachParticle resetHydrodynamicForceTorque"].end();
+
+      if (infoSpacing != 0 && timeStep % infoSpacing == 0)
+      {
+         timeloopTiming["Evaluate infos"].start();
+
+         auto particleInfo = evaluateParticleInfo(*accessor);
+         WALBERLA_LOG_INFO_ON_ROOT(particleInfo);
+
+         auto fluidInfo = evaluateFluidInfo(blocks, densityFieldID, velFieldID);
+         WALBERLA_LOG_INFO_ON_ROOT(fluidInfo);
+         if (particleBarriers) WALBERLA_MPI_BARRIER();
+         timeloopTiming["Evaluate infos"].end();
+      }
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace fluidized_bed
+
+int main(int argc, char** argv) { return fluidized_bed::main(argc, argv); } // forward the exit code
diff --git a/apps/benchmarks/FluidizedBed/input.prm b/apps/benchmarks/FluidizedBed/input.prm
new file mode 100644
index 0000000000000000000000000000000000000000..aa293e8609c90fa61cf982b5d69d998d5a6e66ec
--- /dev/null
+++ b/apps/benchmarks/FluidizedBed/input.prm
@@ -0,0 +1,54 @@
+PhysicalSetup // all to be specified in SI units!
+{
+    xSize 0.05; // = width, in m
+    ySize 0.02; // = depth, in m
+    zSize 0.08; // = height, in m
+
+    periodicInX false; // false: NoSlip borders are set at W and E (presumably via the same-named variable)
+    periodicInY false; // false: NoSlip borders are set at S and N (presumably via the same-named variable)
+
+    runtime 0.1; // in s
+
+    uInflow 0.005; // in m/s
+    gravitationalAcceleration 9.81; // in m/s^2
+
+    kinematicViscosityFluid 1e-5; // in m^2/s
+    densityFluid 1000.; // in kg/m^3
+
+    particleDiameter 0.002; // in m
+    densityParticle 1100.; // in kg/m^3
+    dynamicFrictionCoefficient 0.15;
+    coefficientOfRestitution 0.6;
+    collisionTimeFactor 1.0;
+
+    particleGenerationSpacing 0.00401; // in m; 0.00401 or 0.00201
+}
+
+NumericalSetup
+{
+    dx 0.0001; // in m
+    uInflow 0.01; // in LBM units, should be smaller than 0.1, this then determines dt
+
+    // product of number of blocks should be equal to number of used processes
+    numXBlocks 1;
+    numYBlocks 1;
+    numZBlocks 1;
+
+    useLubricationForces true; // true: the lubrication correction kernel runs in every particle sub-cycle
+    numberOfParticleSubCycles 10; // number of particle sub-cycles executed per time step
+
+    particleSubBlockSize <10, 10, 10>; // handed to the PSM sweep collection
+    linkedCellWidthRation 1.01; // multiplied by the diameter to obtain the linked-cell width
+    particleBarriers true; // true: an MPI barrier is issued before each particle timer is stopped
+}
+
+Output
+{
+    infoSpacing 0.0; // in s; 0 presumably disables the particle/fluid info logging
+
+    vtkSpacingParticles 0.0; // in s; 0 presumably disables the particle VTK output
+    vtkSpacingFluid 0.0; // in s; 0 presumably disables the fluid VTK output
+    vtkFolder vtk_out; // folder handed to the VTK writers
+
+    performanceLogFrequency 500; // handed to the LBM performance logger
+}
diff --git a/apps/benchmarks/Percolation/CMakeLists.txt b/apps/benchmarks/Percolation/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb8c4f3dc9aa15c71786f4a1bc0a9cc0d28c9664
--- /dev/null
+++ b/apps/benchmarks/Percolation/CMakeLists.txt
@@ -0,0 +1,9 @@
+waLBerla_link_files_to_builddir("*.prm")
+
+# Needs codegen; CPU builds always qualify, GPU builds need HIP or a CUDA architecture >= 60.
+if (WALBERLA_BUILD_WITH_CODEGEN AND (NOT WALBERLA_BUILD_WITH_GPU_SUPPORT OR WALBERLA_BUILD_WITH_HIP OR CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60))
+    waLBerla_add_executable(NAME Percolation FILES Percolation.cpp
+            DEPENDS blockforest core field geometry gpu lbm lbm_mesapd_coupling mesa_pd sqlite vtk PSMCodegenPython_trt-smagorinsky_sc1)
+    # Weighting is the template argument of ParticleAndVolumeFractionSoA_T in Percolation.cpp.
+    target_compile_definitions(Percolation PRIVATE Weighting=2)
+endif ()
diff --git a/apps/benchmarks/Percolation/Percolation.cpp b/apps/benchmarks/Percolation/Percolation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f778f7bc951582c2aa5ef04281c66801bd8c0728
--- /dev/null
+++ b/apps/benchmarks/Percolation/Percolation.cpp
@@ -0,0 +1,507 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Percolation.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/grid_generator/SCIterator.h"
+#include "core/logging/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/vtk/all.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm/PerformanceLogger.h"
+#include "lbm/vtk/all.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "sqlite/SQLite.h"
+
+#include "vtk/all.h"
+
+#include "LBMSweep.h"
+#include "PSMPackInfo.h"
+#include "PSMSweep.h"
+#include "PSM_Density.h"
+#include "PSM_InfoHeader.h"
+#include "PSM_MacroGetter.h"
+
+namespace percolation
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using namespace lbm_mesapd_coupling::psm::gpu;
+typedef pystencils::PSMPackInfo PackInfo_T;
+
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag("Fluid");
+const FlagUID Density_Flag("Density");
+const FlagUID NoSlip_Flag("NoSlip");
+const FlagUID Inflow_Flag("Inflow");
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Benchmark of a percolation setup
+ *
+ * This code can be used as a percolation (useParticles=true) or as a channel flow (useParticles=false) benchmark.
+ * A constant inflow from west is applied and a pressure boundary condition is set at the east.
+ * For the percolation, mono-sized fixed spherical particles are generated on a structured grid with an offset for
+ * every second particle layer in flow direction to avoid channels in flow direction. The flow is described by Darcy's
+ * law. For the channel flow, the flow is described by the Hagen–Poiseuille equation.
+ *
+ * The domain is either periodic or bounded by (no slip) walls in the directions perpendicular to the flow (y and z).
+ *
+ * For the percolation, the PSM is used in combination with a two-way coupling, but no particle dynamics.
+ * For the channel flow, only the LBM is used.
+ *
+ * The parameters can be changed via the input file.
+ *
+ */
+//*******************************************************************************************************************
+int main(int argc, char** argv)
+{
+   Environment env(argc, argv);
+   auto cfgFile = env.config();
+   if (!cfgFile) { WALBERLA_ABORT("Usage: " << argv[0] << " path-to-configuration-file \n"); }
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::selectDeviceBasedOnMpiRank();
+#endif
+
+   WALBERLA_LOG_INFO_ON_ROOT("waLBerla revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8));
+   WALBERLA_LOG_INFO_ON_ROOT("compiler flags: " << std::string(WALBERLA_COMPILER_FLAGS));
+   WALBERLA_LOG_INFO_ON_ROOT("build machine: " << std::string(WALBERLA_BUILD_MACHINE));
+   WALBERLA_LOG_INFO_ON_ROOT(*cfgFile);
+
+   // Read config file
+   Config::BlockHandle numericalSetup = cfgFile->getBlock("NumericalSetup");
+   const uint_t numXBlocks            = numericalSetup.getParameter< uint_t >("numXBlocks");
+   const uint_t numYBlocks            = numericalSetup.getParameter< uint_t >("numYBlocks");
+   const uint_t numZBlocks            = numericalSetup.getParameter< uint_t >("numZBlocks");
+   WALBERLA_CHECK_EQUAL(numXBlocks * numYBlocks * numZBlocks, uint_t(MPIManager::instance()->numProcesses()),
+                        "When using GPUs, the number of blocks ("
+                           << numXBlocks * numYBlocks * numZBlocks << ") has to match the number of MPI processes ("
+                           << uint_t(MPIManager::instance()->numProcesses()) << ")");
+   const bool periodicInY                 = numericalSetup.getParameter< bool >("periodicInY");
+   const bool periodicInZ                 = numericalSetup.getParameter< bool >("periodicInZ");
+   const uint_t numXCellsPerBlock         = numericalSetup.getParameter< uint_t >("numXCellsPerBlock");
+   const uint_t numYCellsPerBlock         = numericalSetup.getParameter< uint_t >("numYCellsPerBlock");
+   const uint_t numZCellsPerBlock         = numericalSetup.getParameter< uint_t >("numZCellsPerBlock");
+   const bool sendDirectlyFromGPU         = numericalSetup.getParameter< bool >("sendDirectlyFromGPU");
+   const bool useCommunicationHiding      = numericalSetup.getParameter< bool >("useCommunicationHiding");
+   const Vector3< uint_t > frameWidth     = numericalSetup.getParameter< Vector3< uint_t > >("frameWidth");
+   const uint_t timeSteps                 = numericalSetup.getParameter< uint_t >("timeSteps");
+   const bool useParticles                = numericalSetup.getParameter< bool >("useParticles");
+   const real_t particleDiameter          = numericalSetup.getParameter< real_t >("particleDiameter");
+   const real_t particleGenerationSpacing = numericalSetup.getParameter< real_t >("particleGenerationSpacing");
+   const Vector3< real_t > generationDomainFraction =
+      numericalSetup.getParameter< Vector3< real_t > >("generationDomainFraction");
+   const Vector3< uint_t > generationPointOfReferenceOffset =
+      numericalSetup.getParameter< Vector3< uint_t > >("generationPointOfReferenceOffset");
+   const bool useParticleOffset = numericalSetup.getParameter< bool >("useParticleOffset");
+   const Vector3< uint_t > particleSubBlockSize =
+      numericalSetup.getParameter< Vector3< uint_t > >("particleSubBlockSize");
+   const real_t uInflow        = numericalSetup.getParameter< real_t >("uInflow");
+   const real_t relaxationRate = numericalSetup.getParameter< real_t >("relaxationRate");
+   if ((periodicInY && numYBlocks == 1) || (periodicInZ && numZBlocks == 1))
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("Using only 1 block in periodic dimensions can lead to unexpected behavior.")
+   }
+   const real_t viscosity = lbm::collision_model::viscosityFromOmega(relaxationRate);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(viscosity)
+
+   Config::BlockHandle outputSetup      = cfgFile->getBlock("Output");
+   const uint_t vtkSpacing              = outputSetup.getParameter< uint_t >("vtkSpacing");
+   const std::string vtkFolder          = outputSetup.getParameter< std::string >("vtkFolder");
+   const uint_t performanceLogFrequency = outputSetup.getParameter< uint_t >("performanceLogFrequency");
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const bool periodicInX                     = false;
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid(
+      numXBlocks, numYBlocks, numZBlocks, numXCellsPerBlock, numYCellsPerBlock, numZCellsPerBlock, real_t(1), uint_t(0),
+      false, false, periodicInX, periodicInY, periodicInZ, // periodicity
+      false);
+
+   auto simulationDomain = blocks->getDomain();
+
+   ////////////
+   // MesaPD //
+   ////////////
+
+   auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+
+   // Init data structures
+   auto ps                  = walberla::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = walberla::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+   auto sphereShape         = ss->create< mesa_pd::data::Sphere >(particleDiameter * real_t(0.5));
+
+   // Create spheres
+   if (useParticles)
+   {
+      // Ensure that generation domain is computed correctly
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.xMin(), real_t(0));
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.yMin(), real_t(0));
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.zMin(), real_t(0));
+
+      auto generationDomain = math::AABB::createFromMinMaxCorner(
+         math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) - generationDomainFraction[0]) / real_t(2),
+                                 simulationDomain.yMax() * (real_t(1) - generationDomainFraction[1]) / real_t(2),
+                                 simulationDomain.zMax() * (real_t(1) - generationDomainFraction[2]) / real_t(2)),
+         math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) + generationDomainFraction[0]) / real_t(2),
+                                 simulationDomain.yMax() * (real_t(1) + generationDomainFraction[1]) / real_t(2),
+                                 simulationDomain.zMax() * (real_t(1) + generationDomainFraction[2]) / real_t(2)));
+      real_t particleOffset = particleGenerationSpacing / real_t(2);
+      for (auto pt :
+           grid_generator::SCGrid(generationDomain, generationDomain.center() + generationPointOfReferenceOffset,
+                                  particleGenerationSpacing))
+      {
+         // Offset every second particle layer in flow direction to avoid channels in flow direction
+         if (useParticleOffset &&
+             uint_t(round(math::abs(generationDomain.center()[0] - pt[0]) / (particleGenerationSpacing))) % uint_t(2) !=
+                uint_t(0))
+         {
+            pt = pt + Vector3(real_t(0), particleOffset, particleOffset);
+         }
+         if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), pt))
+         {
+            mesa_pd::data::Particle&& p = *ps->create();
+            p.setPosition(pt);
+            p.setInteractionRadius(particleDiameter * real_t(0.5));
+            p.setOwner(mpi::MPIManager::instance()->rank());
+            p.setShapeID(sphereShape);
+            p.setType(0);
+         }
+      }
+   }
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // Setting initial PDFs to nan helps to detect bugs in the initialization/BC handling
+   // Depending on WALBERLA_BUILD_WITH_GPU_SUPPORT, pdfFieldCPUGPUID is either a GPU or a CPU field
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID BFieldID         = field::addToStorage< BField_T >(blocks, "B field", 0, field::fzyx, 1);
+   BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
+#else
+   BlockDataID pdfFieldCPUGPUID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
+#endif
+   BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "density field", real_t(0), field::fzyx);
+   BlockDataID velFieldID  = field::addToStorage< VelocityField_T >(blocks, "velocity field", real_t(0), field::fzyx);
+   BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   // Synchronize particles between the blocks for the correct mapping of ghost particles
+   mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+   syncNextNeighborFunc(*ps, *rpdDomain);
+
+   // Assemble boundary block string
+   std::string boundariesBlockString = " Boundaries"
+                                       "{"
+                                       "Border { direction W;    walldistance -1;  flag Inflow; }"
+                                       "Border { direction E;    walldistance -1;  flag Density; }";
+
+   if (!periodicInY)
+   {
+      boundariesBlockString += "Border { direction S;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction N;    walldistance -1;  flag NoSlip; }";
+   }
+
+   if (!periodicInZ)
+   {
+      boundariesBlockString += "Border { direction T;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction B;    walldistance -1;  flag NoSlip; }";
+   }
+
+   boundariesBlockString += "}";
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream boundariesFile("boundaries.prm");
+      boundariesFile << boundariesBlockString;
+      boundariesFile.close();
+   }
+   WALBERLA_MPI_BARRIER()
+
+   auto boundariesCfgFile = Config();
+   boundariesCfgFile.readParameterFile("boundaries.prm");
+   auto boundariesConfig = boundariesCfgFile.getBlock("Boundaries");
+
+   // map boundaries into the LBM simulation
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, Fluid_Flag);
+   lbm::PSM_Density density_bc(blocks, pdfFieldCPUGPUID, real_t(1.0));
+   density_bc.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Density_Flag, Fluid_Flag);
+   lbm::PSM_NoSlip noSlip(blocks, pdfFieldCPUGPUID);
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, NoSlip_Flag, Fluid_Flag);
+   lbm::PSM_UBB ubb(blocks, pdfFieldCPUGPUID, uInflow, real_t(0), real_t(0));
+   ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Inflow_Flag, Fluid_Flag);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // Map particles into the fluid domain
+   ParticleAndVolumeFractionSoA_T< Weighting > particleAndVolumeFractionSoA(blocks, relaxationRate);
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
+                                         particleAndVolumeFractionSoA, particleSubBlockSize);
+   if (useParticles)
+   {
+      for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+      {
+         psmSweepCollection.particleMappingSweep(&(*blockIt));
+      }
+   }
+
+   // Initialize PDFs
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      if (useParticles) { psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt)); }
+      pdfSetter(&(*blockIt));
+   }
+
+   // Setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU, false);
+#else
+   walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
+#endif
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   SweepTimeloop commTimeloop(blocks->getBlockStorage(), timeSteps);
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timeSteps);
+
+   timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, real_t(0.0), real_t(0.0),
+                                           real_t(0.0));
+#else
+   pystencils::PSM_MacroGetter getterSweep(particleAndVolumeFractionSoA.BFieldID, densityFieldID, pdfFieldCPUGPUID,
+                                           velFieldID, real_t(0.0), real_t(0.0), real_t(0.0));
+#endif
+   // VTK output
+   if (vtkSpacing != uint_t(0))
+   {
+      // Spheres
+      auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
+      // Limit output to process-local spheres
+      particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {
+         return pIt->getShapeID() == sphereShape &&
+                !(mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST));
+      });
+      auto particleVtkWriter = vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacing, vtkFolder);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+
+      // Fields
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid", vtkSpacing, 0, false, vtkFolder);
+
+      pdfFieldVTK->addBeforeFunction(communication);
+
+      pdfFieldVTK->addBeforeFunction([&]() {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+         gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldCPUGPUID);
+         gpu::fieldCpy< GhostLayerField< real_t, 1 >, BFieldGPU_T >(blocks, BFieldID,
+                                                                    particleAndVolumeFractionSoA.BFieldID);
+#endif
+         for (auto& block : *blocks)
+            getterSweep(&block);
+      });
+
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< DensityField_T > >(densityFieldID, "Density"));
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(BFieldID, "OverlapFraction"));
+#else
+      pdfFieldVTK->addCellDataWriter(
+         make_shared< field::VTKWriter< BField_T > >(particleAndVolumeFractionSoA.BFieldID, "OverlapFraction"));
+#endif
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< FlagField_T > >(flagFieldID, "FlagField"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   if (vtkSpacing != uint_t(0)) { vtk::writeDomainDecomposition(blocks, "domain_decomposition", vtkFolder); }
+
+   // Add performance logging
+   lbm::PerformanceLogger< FlagField_T > performanceLogger(blocks, flagFieldID, Fluid_Flag, performanceLogFrequency);
+   if (performanceLogFrequency > 0)
+   {
+      timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluate performance logging");
+   }
+
+   // Add LBM communication function and boundary handling sweep
+   if (useCommunicationHiding)
+   {
+      timeloop.add() << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
+   }
+   else
+   {
+      timeloop.add() << BeforeFunction(communication, "LBM Communication")
+                     << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
+   }
+   timeloop.add() << Sweep(deviceSyncWrapper(ubb.getSweep()), "Boundary Handling (UBB)");
+   if (!periodicInY || !periodicInZ)
+   {
+      timeloop.add() << Sweep(deviceSyncWrapper(noSlip.getSweep()), "Boundary Handling (NoSlip)");
+   }
+
+   // PSM kernel
+   pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                 particleAndVolumeFractionSoA.particleForcesFieldID,
+                                 particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0.0),
+                                 real_t(0.0), real_t(0.0), relaxationRate);
+   pystencils::PSMSweepSplit PSMSplitSweep(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleForcesFieldID, particleAndVolumeFractionSoA.particleVelocitiesFieldID,
+      pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate, frameWidth);
+   pystencils::LBMSweep LBMSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate);
+   pystencils::LBMSplitSweep LBMSplitSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate,
+                                           frameWidth);
+
+   if (useParticles)
+   {
+      if (useCommunicationHiding)
+      {
+         addPSMSweepsToTimeloops(commTimeloop, timeloop, com, psmSweepCollection, PSMSplitSweep);
+      }
+      else { addPSMSweepsToTimeloop(timeloop, psmSweepCollection, PSMSweep); }
+   }
+   else
+   {
+      if (useCommunicationHiding)
+      {
+         commTimeloop.add() << BeforeFunction([&]() { com.startCommunication(); }, "LBM Communication (start)")
+                            << Sweep(deviceSyncWrapper(LBMSplitSweep.getInnerSweep()), "LBM inner sweep")
+                            << AfterFunction([&]() { com.wait(); }, "LBM Communication (wait)");
+         timeloop.add() << Sweep(deviceSyncWrapper(LBMSplitSweep.getOuterSweep()), "LBM outer sweep");
+      }
+      else { timeloop.add() << Sweep(deviceSyncWrapper(LBMSweep), "LBM sweep"); }
+   }
+
+   WcTimingPool timeloopTiming;
+   // TODO: maybe add warmup phase
+   for (uint_t timeStep = 0; timeStep < timeSteps; ++timeStep)
+   {
+      if (useCommunicationHiding) { commTimeloop.singleStep(timeloopTiming); }
+      timeloop.singleStep(timeloopTiming);
+   }
+   timeloopTiming.logResultOnRoot();
+   auto timeloopTimingReduced = timeloopTiming.getReduced();
+
+   // Write parameters and performance results in sqlite database
+   WALBERLA_ROOT_SECTION()
+   {
+      // Use DB_FILE environment variable if set
+      std::string dbFile;
+      if (std::getenv("DB_FILE") != nullptr) { dbFile = std::getenv("DB_FILE"); }
+      else
+      {
+         if (useParticles) { dbFile = "percolation_benchmark.sqlite3"; }
+         else { dbFile = "channel_flow_benchmark.sqlite3"; }
+      }
+
+      std::map< std::string, int > integerProperties;
+      std::map< std::string, double > realProperties;
+      std::map< std::string, std::string > stringProperties;
+
+      integerProperties["numXBlocks"]                = int(numXBlocks);
+      integerProperties["numYBlocks"]                = int(numYBlocks);
+      integerProperties["numZBlocks"]                = int(numZBlocks);
+      integerProperties["numXCellsPerBlock"]         = int(numXCellsPerBlock);
+      integerProperties["numYCellsPerBlock"]         = int(numYCellsPerBlock);
+      integerProperties["numZCellsPerBlock"]         = int(numZCellsPerBlock);
+      integerProperties["timeSteps"]                 = int(timeSteps);
+      integerProperties["sendDirectlyFromGPU"]       = int(sendDirectlyFromGPU);
+      integerProperties["useCommunicationHiding"]    = int(useCommunicationHiding);
+      integerProperties["communicationHidingXWidth"] = int(frameWidth[0]);
+      integerProperties["communicationHidingYWidth"] = int(frameWidth[1]);
+      integerProperties["communicationHidingZWidth"] = int(frameWidth[2]);
+      integerProperties["useParticles"]              = int(useParticles);
+      integerProperties["numParticles"]              = int(ps->size());
+      integerProperties["particleSubBlockXSize"]     = int(particleSubBlockSize[0]);
+      integerProperties["particleSubBlockYSize"]     = int(particleSubBlockSize[1]);
+      integerProperties["particleSubBlockZSize"]     = int(particleSubBlockSize[2]);
+
+      realProperties["particleDiameter"]          = double(particleDiameter);
+      realProperties["particleGenerationSpacing"] = double(particleGenerationSpacing);
+
+      performanceLogger.getBestResultsForSQLOnRoot(integerProperties, realProperties, stringProperties);
+
+      auto runId = sqlite::storeRunInSqliteDB(dbFile, integerProperties, stringProperties, realProperties);
+      sqlite::storeTimingPoolInSqliteDB(dbFile, runId, *timeloopTimingReduced, "Timeloop");
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace percolation
+
+int main(int argc, char** argv) { return percolation::main(argc, argv); } // forward the exit code to the caller
diff --git a/apps/benchmarks/Percolation/benchmark.prm b/apps/benchmarks/Percolation/benchmark.prm
new file mode 100644
index 0000000000000000000000000000000000000000..f401949b546987d81fd55c052f1aa91973c834ad
--- /dev/null
+++ b/apps/benchmarks/Percolation/benchmark.prm
@@ -0,0 +1,42 @@
+
+NumericalSetup
+{
+    // product of number of blocks should be equal to number of used processes
+    numXBlocks 1;
+    numYBlocks 1;
+    numZBlocks 1;
+
+    periodicInY false;
+    periodicInZ false;
+
+    numXCellsPerBlock 256;
+    numYCellsPerBlock 128;
+    numZCellsPerBlock 128;
+
+    timeSteps 100;
+
+    sendDirectlyFromGPU false; // use GPU-GPU communication
+    useCommunicationHiding false;
+    frameWidth <1, 1, 1>; // width of the outer region if splitting the LBM/PSM into inner and outer (only used if useCommunicationHiding is true)
+
+    // particle distribution in LBM units
+    useParticles true; // if true, PSM/particle mapping/velocity computation/hydrodynamic force reduction is used, else LBM is used
+    particleDiameter 20.0;
+    particleGenerationSpacing 21.0;
+    generationDomainFraction <0.8, 1.0, 1.0>; // fraction of the domain where particles are generated
+    generationPointOfReferenceOffset <0, 0, 0>; // offset of point of reference from domain center, see SCIterator.h
+    useParticleOffset true; // offset every second particle layer in flow direction
+    particleSubBlockSize <8, 8, 8>;
+
+    // fluid quantities in LBM units
+    uInflow 0.00008;
+    relaxationRate 0.9;
+}
+
+Output
+{
+    vtkSpacing 0;
+    vtkFolder vtk_out;
+
+    performanceLogFrequency 100;
+}
diff --git a/apps/showcases/CMakeLists.txt b/apps/showcases/CMakeLists.txt
index f601fdd24ebb1454ae518db8405be992f9f0be57..d845f5271c267d684793a6c795c49a558b323cd7 100644
--- a/apps/showcases/CMakeLists.txt
+++ b/apps/showcases/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory( LightRisingParticleInFluidAMR )
 add_subdirectory( Mixer )
 add_subdirectory( ParticlePacking )
 add_subdirectory( PegIntoSphereBed )
+add_subdirectory( Piping )
 if ( WALBERLA_BUILD_WITH_CODEGEN)
 
    add_subdirectory( Antidunes )
diff --git a/apps/showcases/Piping/CMakeLists.txt b/apps/showcases/Piping/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c86a1ee331e80325e60c529c37cc59c9f8871078
--- /dev/null
+++ b/apps/showcases/Piping/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_subdirectory( setups )
+
+waLBerla_add_executable( NAME SettlingSpheres
+                         FILES SettlingSpheres.cpp
+                         DEPENDS blockforest core field lbm_mesapd_coupling mesa_pd vtk )
diff --git a/apps/showcases/Piping/SettlingSpheres.cpp b/apps/showcases/Piping/SettlingSpheres.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2477f0fb0716f2521f271c6eb4c309bed166baa
--- /dev/null
+++ b/apps/showcases/Piping/SettlingSpheres.cpp
@@ -0,0 +1,300 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   SettlingSpheres.cpp
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \brief Based on showcases/Antidunes/BedGeneration.cpp
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/grid_generator/SCIterator.h"
+#include "core/math/Random.h"
+#include "core/mpi/Reduce.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/LinkedCells.h"
+#include "mesa_pd/data/ParticleAccessorWithBaseShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/AssocToBlock.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/InsertParticleIntoLinkedCells.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "vtk/VTKOutput.h"
+
+#include "utility/ParticleUtility.h"
+
+namespace walberla
+{
+namespace piping
+{
+
+using namespace mesa_pd;
+
+int main(int argc, char** argv)
+{
+   Environment env(argc, argv);
+   walberla::mpi::MPIManager::instance()->useWorldComm();
+   // Pure particle simulation: spheres settle under gravity minus buoyancy, the final bed is written to outFileName
+   // Config (all parameters are read from the "BedGeneration" block)
+   auto cfg = env.config();
+   if (cfg == nullptr) WALBERLA_ABORT("No config specified!");
+   WALBERLA_LOG_INFO_ON_ROOT(*cfg);
+   const Config::BlockHandle bedGenerationConf = cfg->getBlock("BedGeneration");
+
+   const Vec3 domainSize_SI    = bedGenerationConf.getParameter< Vec3 >("domainSize_SI");
+   const Vector3< int > blocks = bedGenerationConf.getParameter< Vector3< int > >("blocks");
+   WALBERLA_CHECK_EQUAL(blocks[0] * blocks[1] * blocks[2], uint_t(MPIManager::instance()->numProcesses()),
+                        "The number of blocks (" << blocks[0] * blocks[1] * blocks[2]
+                                                 << ") has to match the number of MPI processes ("
+                                                 << uint_t(MPIManager::instance()->numProcesses()) << ")");
+   const bool periodicInX = bedGenerationConf.getParameter< bool >("periodicInX");
+   const bool periodicInY = bedGenerationConf.getParameter< bool >("periodicInY");
+   if ((periodicInX && blocks[0] == 1) || (periodicInY && blocks[1] == 1))
+   {
+      WALBERLA_ABORT("The number of blocks in periodic dimensions must be greater than 1.")
+   }
+   const real_t minDiameter_SI         = bedGenerationConf.getParameter< real_t >("minDiameter_SI");
+   const real_t maxDiameter_SI         = bedGenerationConf.getParameter< real_t >("maxDiameter_SI");
+   const real_t gravity_SI             = bedGenerationConf.getParameter< real_t >("gravity_SI");
+   const real_t densityFluid_SI        = bedGenerationConf.getParameter< real_t >("densityFluid_SI");
+   const real_t densityParticle_SI     = bedGenerationConf.getParameter< real_t >("densityParticle_SI");
+   const real_t generationSpacing_SI   = bedGenerationConf.getParameter< real_t >("generationSpacing_SI");
+   const real_t initialVelocity_SI     = bedGenerationConf.getParameter< real_t >("initialVelocity_SI");
+   const real_t dt_SI                  = bedGenerationConf.getParameter< real_t >("dt_SI");
+   const real_t frictionCoefficient    = bedGenerationConf.getParameter< real_t >("frictionCoefficient");
+   const real_t restitutionCoefficient = bedGenerationConf.getParameter< real_t >("restitutionCoefficient");
+   const real_t collisionTime_SI       = bedGenerationConf.getParameter< real_t >("collisionTime_SI");
+   const real_t poissonsRatio          = bedGenerationConf.getParameter< real_t >("poissonsRatio");
+   const uint_t timeSteps              = bedGenerationConf.getParameter< uint_t >("timeSteps");
+   const uint_t visSpacing             = bedGenerationConf.getParameter< uint_t >("visSpacing");
+   const std::string outFileName       = bedGenerationConf.getParameter< std::string >("outFileName");
+
+   const bool useOpenMP = false;
+
+   // BlockForest
+   const math::AABB simulationDomain_SI(real_t(0.0), real_t(0.0), real_t(0.0), domainSize_SI[0], domainSize_SI[1],
+                                        domainSize_SI[2]);
+   const Vector3< bool > isPeriodic{ periodicInX, periodicInY, false };
+
+   shared_ptr< BlockForest > forest = blockforest::createBlockForest(simulationDomain_SI, blocks, isPeriodic);
+   auto domain                      = std::make_shared< mesa_pd::domain::BlockForestDomain >(forest);
+
+   // MesaPD data structures
+   auto ps = std::make_shared< data::ParticleStorage >(1);
+   data::ParticleAccessorWithBaseShape accessor(ps);
+
+   // Init spheres
+   // Use offset to domain boundary to prevent the spheres from touching in the beginning
+   const real_t domainOffset = maxDiameter_SI / real_t(2);
+   const math::AABB generationDomain_SI(
+      simulationDomain_SI.xMin() + domainOffset, simulationDomain_SI.yMin() + domainOffset,
+      simulationDomain_SI.zMin() + domainOffset, simulationDomain_SI.xMax() - domainOffset,
+      simulationDomain_SI.yMax() - domainOffset, simulationDomain_SI.zMax() - domainOffset);
+   math::seedRandomGenerator(42);
+
+   for (auto pt :
+        grid_generator::SCGrid(generationDomain_SI, Vec3(generationSpacing_SI) * real_c(0.5), generationSpacing_SI))
+   {
+      auto diameter = math::realRandom< real_t >(minDiameter_SI, maxDiameter_SI);
+
+      if (!domain->isContainedInLocalSubdomain(pt, real_t(0))) continue;
+      auto p                       = ps->create();
+      p->getPositionRef()          = pt;
+      p->getInteractionRadiusRef() = diameter * real_t(0.5);
+      p->getBaseShapeRef()         = std::make_shared< data::Sphere >(p->getInteractionRadius());
+      p->getBaseShapeRef()->updateMassAndInertia(densityParticle_SI);
+
+      p->setLinearVelocity(Vec3(real_t(0.1) * math::realRandom(-initialVelocity_SI, initialVelocity_SI),
+                                real_t(0.1) * math::realRandom(-initialVelocity_SI, initialVelocity_SI),
+                                -initialVelocity_SI));
+      p->getOwnerRef() = walberla::mpi::MPIManager::instance()->rank();
+      p->getTypeRef()  = 0;
+   }
+   // Sum the local sphere counts over all processes (used on root for the PUpS output); no planes created yet
+   uint_t numParticles = ps->size();
+   walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
+   // Bottom and top walls are always created since the z direction is never periodic
+   createPlane(*ps, simulationDomain_SI.minCorner(), Vec3(real_t(0), real_t(0), real_t(1)));
+   createPlane(*ps, simulationDomain_SI.maxCorner(), Vec3(real_t(0), real_t(0), real_t(-1)));
+
+   if (!isPeriodic[0])
+   {
+      createPlane(*ps, simulationDomain_SI.minCorner(), Vector3< real_t >(1, 0, 0));
+      createPlane(*ps, simulationDomain_SI.maxCorner(), Vector3< real_t >(-1, 0, 0));
+   }
+   if (!isPeriodic[1])
+   {
+      createPlane(*ps, simulationDomain_SI.minCorner(), Vector3< real_t >(0, 1, 0));
+      createPlane(*ps, simulationDomain_SI.maxCorner(), Vector3< real_t >(0, -1, 0));
+   }
+
+   // VTK
+   auto vtkDomainOutput = walberla::vtk::createVTKOutput_DomainDecomposition(forest, "domain_decomposition", 1,
+                                                                             "vtk_settling_spheres", "simulation_step");
+   if (visSpacing > 0) { vtkDomainOutput->write(); }
+
+   auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
+   particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+   particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
+   particleVtkOutput->setParticleSelector([](const data::ParticleStorage::iterator& pIt) {
+      using namespace walberla::mesa_pd::data::particle_flags;
+      return (pIt->getBaseShape()->getShapeType() == data::Sphere::SHAPE_TYPE) && !isSet(pIt->getFlags(), GHOST);
+   });
+   auto vtkWriter = walberla::vtk::createVTKOutput_PointData(particleVtkOutput, "Particles", 1, "vtk_settling_spheres",
+                                                             "simulation_step", false, false);
+
+   // Init kernels
+   mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(dt_SI);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(dt_SI);
+   kernel::LinearSpringDashpot dem(2);
+   dem.setFrictionCoefficientDynamic(0, 0, frictionCoefficient);
+   // Use friction between spheres and planes to speed up the settling
+   dem.setFrictionCoefficientDynamic(0, 1, frictionCoefficient);
+   real_t kappa = real_t(2) * (real_t(1) - poissonsRatio) / (real_t(2) - poissonsRatio); // from Thornton et al
+
+   kernel::AssocToBlock assoc(forest);
+   mesa_pd::mpi::ReduceProperty RP;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+   mesa_pd::mpi::SyncNextNeighbors SNN;
+
+   ps->forEachParticle(useOpenMP, kernel::SelectLocal(), accessor, assoc, accessor);
+
+   // initial sync
+   SNN(*ps, *domain);
+
+   real_t linkedCellWidth = 1.01_r * maxDiameter_SI;
+   data::LinkedCells linkedCells(domain->getUnionOfLocalAABBs().getExtended(linkedCellWidth), linkedCellWidth);
+   kernel::InsertParticleIntoLinkedCells ipilc;
+
+   WcTimer timer;
+   WcTimingPool timeloopTiming;
+   timer.start();
+   for (uint_t i = 0; i < timeSteps; ++i)
+   {
+      if (visSpacing > 0 && i % visSpacing == 0) { vtkWriter->write(); }
+
+      timeloopTiming["RPD forEachParticle assoc"].start();
+      ps->forEachParticle(useOpenMP, kernel::SelectLocal(), accessor, assoc, accessor);
+      timeloopTiming["RPD forEachParticle assoc"].end();
+
+      timeloopTiming["RPD forEachParticle vvIntegratorPreForce"].start();
+      ps->forEachParticle(useOpenMP, kernel::SelectLocal(), accessor, vvIntegratorPreForce, accessor);
+      timeloopTiming["RPD forEachParticle vvIntegratorPreForce"].end();
+
+      timeloopTiming["SNN"].start();
+      SNN(*ps, *domain);
+      timeloopTiming["SNN"].end();
+
+      // gravity - buoyancy
+      timeloopTiming["RPD forEachParticle addGravitationalForce"].start();
+      ps->forEachParticle(
+         useOpenMP, kernel::SelectLocal(), accessor,
+         [densityParticle_SI, densityFluid_SI, gravity_SI](const size_t idx, auto& ac) {
+            mesa_pd::addForceAtomic(
+               idx, ac, Vec3(0, 0, -(densityParticle_SI - densityFluid_SI) * ac.getVolume(idx) * gravity_SI));
+         },
+         accessor);
+      timeloopTiming["RPD forEachParticle addGravitationalForce"].end();
+
+      timeloopTiming["RPD linkedCells.clear"].start();
+      linkedCells.clear();
+      timeloopTiming["RPD linkedCells.clear"].end();
+      timeloopTiming["RPD forEachParticle ipilc"].start();
+      ps->forEachParticle(useOpenMP, kernel::SelectAll(), accessor, ipilc, accessor, linkedCells);
+      timeloopTiming["RPD forEachParticle ipilc"].end();
+      timeloopTiming["RPD forEachParticlePairHalf dem"].start();
+      linkedCells.forEachParticlePairHalf(
+         useOpenMP, kernel::ExcludeInfiniteInfinite(), accessor,
+         [restitutionCoefficient, collisionTime_SI, kappa, domain, &dem, dt_SI](const size_t idx1, const size_t idx2,
+                                                                                auto& ac) {
+            kernel::DoubleCast double_cast;
+            mesa_pd::mpi::ContactFilter contact_filter;
+            collision_detection::AnalyticContactDetection acd;
+
+            if (double_cast(idx1, idx2, ac, acd, ac))
+            {
+               if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
+               {
+                  auto meff = real_t(1) / (ac.getInvMass(idx1) + ac.getInvMass(idx2));
+                  dem.setStiffnessAndDamping(ac.getType(idx1), ac.getType(idx2), restitutionCoefficient,
+                                             collisionTime_SI, kappa, meff);
+                  dem(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(),
+                      acd.getPenetrationDepth(), dt_SI);
+               }
+            }
+         },
+         accessor);
+      timeloopTiming["RPD forEachParticlePairHalf dem"].end();
+
+      timeloopTiming["RPD reduceProperty reduceAndSwapContactHistory"].start();
+      reduceAndSwapContactHistory(*ps);
+      timeloopTiming["RPD reduceProperty reduceAndSwapContactHistory"].end();
+
+      timeloopTiming["RPD reduceProperty ForceTorqueNotification"].start();
+      RP.operator()< ForceTorqueNotification >(*ps);
+      timeloopTiming["RPD reduceProperty ForceTorqueNotification"].end();
+
+      timeloopTiming["RPD forEachParticle vvIntegratorPostForce"].start();
+      ps->forEachParticle(useOpenMP, kernel::SelectLocal(), accessor, vvIntegratorPostForce, accessor);
+      timeloopTiming["RPD forEachParticle vvIntegratorPostForce"].end();
+      // Log particle velocities every 10% of progress (every step if timeSteps < 10, which would otherwise result in
+      // a modulo by zero). Logging is turned off for benchmark runs (i.e., no vtk output, visSpacing == 0).
+      if (visSpacing != 0 && (timeSteps < uint_t(10) || i % (timeSteps / uint_t(10)) == 0))
+      {
+         real_t maxVelocity          = real_t(0);
+         real_t averageVelocity      = real_t(0);
+         uint_t numAveragedParticles = uint_t(0);
+
+         getParticleVelocities(accessor, numAveragedParticles, maxVelocity, averageVelocity);
+         WALBERLA_LOG_INFO_ON_ROOT("Timestep " << i << " / " << timeSteps << ", average velocity = " << averageVelocity
+                                               << ", max velocity = " << maxVelocity
+                                               << ", #particles = " << numAveragedParticles);
+      }
+   }
+   timer.end();
+
+   auto timer_reduced = walberla::timing::getReduced(timer, timing::REDUCE_TOTAL, 0);
+   WALBERLA_ROOT_SECTION()
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(*timer_reduced);
+      real_t PUpS = real_t(numParticles) * real_t(timeSteps) / real_t(timer_reduced->max());
+      WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpS);
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   writeSphereInformationToFile(outFileName, *ps, domainSize_SI);
+
+   return EXIT_SUCCESS;
+}
+} // namespace piping
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::piping::main(argc, argv); }
diff --git a/apps/showcases/Piping/setups/CMakeLists.txt b/apps/showcases/Piping/setups/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..77d65d222e15d50cbd546143cb68672ae8483946
--- /dev/null
+++ b/apps/showcases/Piping/setups/CMakeLists.txt
@@ -0,0 +1 @@
+waLBerla_link_files_to_builddir(*.prm)
diff --git a/apps/showcases/Piping/setups/SettlingSpheres_Fukumoto.prm b/apps/showcases/Piping/setups/SettlingSpheres_Fukumoto.prm
new file mode 100644
index 0000000000000000000000000000000000000000..92a9ac319fd303fa95d4293411a6050a4c6d80ae
--- /dev/null
+++ b/apps/showcases/Piping/setups/SettlingSpheres_Fukumoto.prm
@@ -0,0 +1,21 @@
+BedGeneration{
+    domainSize_SI < 0.015, 0.003, 0.015 >; // for Fukumoto: < 0.2, 0.003, 0.133 > (results in bed height of ~0.05)
+    blocks < 3, 3, 1 >;
+    periodicInX false;
+    periodicInY false;
+    minDiameter_SI 0.0005;
+    maxDiameter_SI 0.0006;
+    gravity_SI 9.81;
+    densityFluid_SI 1000;
+    densityParticle_SI 2500;
+    generationSpacing_SI 0.0007;
+    initialVelocity_SI 0.05;
+    dt_SI 5e-5; // decrease dt_SI to get lower particle velocity in the end
+    frictionCoefficient 0.6;
+    restitutionCoefficient 0.1; // artificially low value to decrease the rebound (increases settling speed)
+    collisionTime_SI 5e-4; // time to resolve a collision
+    poissonsRatio 0.22;
+    timeSteps 10000;
+    visSpacing 100;
+    outFileName spheres_out.dat;
+}
diff --git a/apps/showcases/Piping/utility/ParticleUtility.h b/apps/showcases/Piping/utility/ParticleUtility.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9b223efcf8328fa8cef1cb47f24ec647914fbbf
--- /dev/null
+++ b/apps/showcases/Piping/utility/ParticleUtility.h
@@ -0,0 +1,306 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   ParticleUtility.h
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/mpi/Broadcast.h"
+#include "core/mpi/MPITextFile.h"
+#include "core/mpi/Reduce.h"
+
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+
+namespace walberla
+{
+namespace piping
+{
+
+// Some functions in this file (as the one below) are based on showcases/Antidunes/Utility.cpp
+
+/// Writes the spheres stored in \p ps to the text file \p filename using walberla::mpi::writeMPITextFile.
+/// The root process first emits \p domainSize; then one "x y z radius" line follows per non-ghost sphere
+/// in \p ps. Non-sphere shapes are skipped; \p precision is passed to std::setprecision.
+// inline: this function is defined in a header, a non-inline definition breaks the ODR once included twice
+inline void writeSphereInformationToFile(const std::string& filename, walberla::mesa_pd::data::ParticleStorage& ps,
+                                         const Vector3< real_t >& domainSize, const int precision = 12)
+{
+   using namespace walberla::mesa_pd::data;
+   using namespace walberla::mesa_pd::data::particle_flags;
+   std::ostringstream ossData;
+   ossData << std::setprecision(precision);
+   WALBERLA_ROOT_SECTION() { ossData << domainSize[0] << " " << domainSize[1] << " " << domainSize[2] << "\n"; }
+
+   for (auto pIt : ps)
+   {
+      if (pIt->getBaseShape()->getShapeType() != Sphere::SHAPE_TYPE) continue; // only spheres are exported
+      if (isSet(pIt->getFlags(), GHOST)) continue;                             // skip ghost particles
+      auto* sphere        = static_cast< Sphere* >(pIt->getBaseShape().get());
+      const auto position = pIt->getPosition();
+      ossData << position[0] << " " << position[1] << " " << position[2] << " " << sphere->getRadius() << '\n';
+   }
+   walberla::mpi::writeMPITextFile(filename, ossData.str());
+}
+
+/// Conservative overlap test of a sphere against an axis-aligned box given by its center \p boxPosition and
+/// full edge lengths \p boxEdgeLength. The sphere is treated as its bounding cube, not as an exact sphere.
+// inline: this function is defined in a header, a non-inline definition breaks the ODR once included twice
+inline bool sphereBoxOverlap(const mesa_pd::Vec3& spherePosition, const real_t sphereRadius,
+                             const mesa_pd::Vec3& boxPosition, const mesa_pd::Vec3& boxEdgeLength)
+{
+   for (uint_t d = 0; d < 3; ++d)
+   {
+      const real_t halfEdge = boxEdgeLength[d] / real_t(2);
+      if (spherePosition[d] + sphereRadius < boxPosition[d] - halfEdge) return false; // ends before the box
+      if (spherePosition[d] - sphereRadius > boxPosition[d] + halfEdge) return false; // starts after the box
+   }
+   return true;
+}
+
+/// Creates the spheres listed in \p fileName in \p ps. The first line holds the generation domain size, every
+/// further line "x y z radius"; all values are scaled by domainSize[0] / generationDomainSize[0]. The file is
+/// read on the root process and broadcast. Spheres outside this process' subdomain or overlapping the box
+/// (see sphereBoxOverlap) are skipped. \p maxDiameter receives the largest created diameter (MAX-reduced).
+// inline: this function is defined in a header, a non-inline definition breaks the ODR once included twice
+inline void initSpheresFromFile(const std::string& fileName, walberla::mesa_pd::data::ParticleStorage& ps,
+                                const walberla::mesa_pd::domain::IDomain& domain, walberla::real_t particleDensity,
+                                math::AABB& simulationDomain, const Vector3< uint_t >& domainSize,
+                                const mesa_pd::Vec3& boxPosition, const mesa_pd::Vec3& boxEdgeLength,
+                                real_t& maxDiameter)
+{
+   using namespace walberla::mesa_pd::data;
+   auto rank = walberla::mpi::MPIManager::instance()->rank();
+   std::string textFile;
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ifstream t(fileName.c_str());
+      if (!t) { WALBERLA_ABORT("Invalid input file " << fileName << "\n"); }
+      std::stringstream buffer;
+      buffer << t.rdbuf();
+      textFile = buffer.str();
+   }
+   walberla::mpi::broadcastObject(textFile);
+
+   std::istringstream fileIss(textFile);
+   std::string line;
+   // first line contains generation domain sizes
+   std::getline(fileIss, line);
+   Vector3< real_t > generationDomainSize_SI(0_r);
+   std::istringstream firstLine(line);
+   firstLine >> generationDomainSize_SI[0] >> generationDomainSize_SI[1] >> generationDomainSize_SI[2];
+   WALBERLA_CHECK(!firstLine.fail() && generationDomainSize_SI[0] > real_t(0),
+                  "First line of sphere file must hold a positive generation domain size");
+   real_t scalingFactor = real_t(domainSize[0]) / generationDomainSize_SI[0];
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(generationDomainSize_SI)
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(scalingFactor)
+
+   real_t minParticleDiameter = std::numeric_limits< real_t >::max();
+   real_t maxParticleDiameter = real_t(0);
+
+   while (std::getline(fileIss, line))
+   {
+      std::istringstream iss(line);
+      ParticleStorage::position_type position;
+      real_t radius = real_t(0);
+      iss >> position[0] >> position[1] >> position[2] >> radius;
+      // Validate every line on every process, before any filtering, so malformed input is never used
+      WALBERLA_CHECK(!iss.fail(), "Malformed line in sphere file (expected: x y z radius)");
+      WALBERLA_CHECK_EQUAL(iss.tellg(), -1); // the whole line must have been consumed
+      position *= scalingFactor;
+      radius *= scalingFactor;
+      WALBERLA_CHECK(simulationDomain.contains(position),
+                     "Particle read from file is not contained in simulation domain");
+
+      if (!domain.isContainedInProcessSubdomain(uint_c(rank), position)) continue;
+      if (sphereBoxOverlap(position, radius, boxPosition, boxEdgeLength)) continue;
+      auto pIt = ps.create();
+      pIt->setPosition(position);
+      pIt->getBaseShapeRef() = std::make_shared< data::Sphere >(radius);
+      pIt->getBaseShapeRef()->updateMassAndInertia(particleDensity);
+      pIt->setInteractionRadius(radius);
+      pIt->setOwner(rank);
+      pIt->setType(0);
+      minParticleDiameter = std::min(real_t(2) * radius, minParticleDiameter);
+      maxParticleDiameter = std::max(real_t(2) * radius, maxParticleDiameter);
+   }
+
+   WALBERLA_MPI_SECTION() { walberla::mpi::allReduceInplace(minParticleDiameter, walberla::mpi::MIN); }
+   WALBERLA_MPI_SECTION() { walberla::mpi::allReduceInplace(maxParticleDiameter, walberla::mpi::MAX); }
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(minParticleDiameter)
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(maxParticleDiameter)
+   // Maximum particle diameter is used for the size of the linked cells
+   maxDiameter = maxParticleDiameter;
+}
+
+/// Counts the non-ghost, non-global particles in \p ac and computes the maximum and the mean magnitude of
+/// their linear velocities; inside WALBERLA_MPI_SECTION the three results are reduced over all processes.
+/// \p averageVelocity is 0 if no particle was counted.
+template< typename ParticleAccessor_T >
+void getParticleVelocities(const ParticleAccessor_T& ac, uint_t& numParticles, real_t& maxVelocity,
+                           real_t& averageVelocity)
+{
+   maxVelocity     = real_t(0);
+   averageVelocity = real_t(0);
+   numParticles    = uint_t(0);
+
+   for (uint_t i = 0; i < ac.size(); ++i)
+   {
+      if (isSet(ac.getFlags(i), walberla::mesa_pd::data::particle_flags::GHOST)) continue;
+      if (isSet(ac.getFlags(i), walberla::mesa_pd::data::particle_flags::GLOBAL)) continue;
+      ++numParticles;
+      const real_t velMagnitude = ac.getLinearVelocity(i).length();
+      maxVelocity               = std::max(maxVelocity, velMagnitude);
+      averageVelocity += velMagnitude; // holds the sum until it is normalized after the reduction
+   }
+   WALBERLA_MPI_SECTION()
+   {
+      walberla::mpi::allReduceInplace(maxVelocity, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(numParticles, walberla::mpi::SUM);
+   }
+   if (numParticles > uint_t(0)) { averageVelocity /= real_t(numParticles); } // avoid 0/0 = NaN
+}
+
+// Adds a half-space flagged GLOBAL, INFINITE, FIXED and NON_COMMUNICATING; inline keeps this header ODR-safe
+inline auto createPlane(mesa_pd::data::ParticleStorage& ps, const mesa_pd::Vec3& pos, const mesa_pd::Vec3& normal)
+{
+   auto p0 = ps.create(true);
+   p0->setPosition(pos);
+   p0->setBaseShape(std::make_shared< mesa_pd::data::HalfSpace >(normal));
+   p0->getBaseShapeRef()->updateMassAndInertia(real_t(1)); // density unused: HalfSpace mass is infinite
+   p0->setOwner(walberla::mpi::MPIManager::instance()->rank());
+   p0->setType(1);
+   p0->setInteractionRadius(std::numeric_limits< real_t >::infinity());
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::GLOBAL);
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::NON_COMMUNICATING);
+   return p0; // the handle obtained from ps.create(true)
+}
+
+/// Adds a box with edge lengths \p edgeLength at \p pos to \p ps, flags it GLOBAL and INFINITE and returns
+/// its UID. The density passed to updateMassAndInertia is 2 for a moving bucket and infinity otherwise.
+// inline: this function is defined in a header, a non-inline definition breaks the ODR once included twice
+inline auto createBox(mesa_pd::data::ParticleStorage& ps, const mesa_pd::Vec3& pos, const mesa_pd::Vec3& edgeLength,
+                      const bool movingBucket)
+{
+   auto p0 = ps.create(true);
+   p0->setPosition(pos);
+   p0->setBaseShape(std::make_shared< mesa_pd::data::Box >(edgeLength));
+
+   // TODO: replace the density of 2.0 that is used for a moving bucket
+   // If the bucket is fixed, its collision behaviour should be the same as for the bounding planes
+   const real_t density = movingBucket ? real_t(2.0) : std::numeric_limits< real_t >::infinity();
+   p0->getBaseShapeRef()->updateMassAndInertia(density);
+
+   p0->setOwner(walberla::mpi::MPIManager::instance()->rank());
+   p0->setType(1);
+   p0->setInteractionRadius(std::numeric_limits< real_t >::infinity());
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::GLOBAL);
+   mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+
+   return p0->getUid();
+}
+
+/// Lets the particles settle under gravity minus buoyancy for \p numTimeSteps velocity-Verlet steps with a
+/// fixed particle time step of 10, logs velocity statistics every numTimeSteps / 10 steps (at least every
+/// step) and finally sets the linear and angular velocities of all local particles to zero.
+template< typename ParticleAccessor_T, typename Sync_T, typename CollisionResponse_T >
+void settleParticles(const uint_t numTimeSteps, const shared_ptr< ParticleAccessor_T >& accessor,
+                     const shared_ptr< mesa_pd::data::ParticleStorage >& ps,
+                     const walberla::mesa_pd::domain::IDomain& domain, mesa_pd::data::LinkedCells& linkedCells,
+                     Sync_T& syncNextNeighborFunc, CollisionResponse_T& collisionResponse,
+                     const real_t& particleDensityRatio, const real_t& gravitationalAcceleration, const bool& useOpenMP)
+{
+   // Increase the settling speed
+   const real_t timeStepSizeParticles = real_t(10);
+   const uint_t logInterval = std::max(numTimeSteps / uint_t(10), uint_t(1)); // >= 1: no modulo by zero
+   mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(timeStepSizeParticles);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeParticles);
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+   mesa_pd::kernel::InsertParticleIntoLinkedCells ipilc;
+
+   WALBERLA_LOG_INFO_ON_ROOT("Starting initial particle settling...")
+
+   for (uint_t t = uint_t(0); t < numTimeSteps; ++t)
+   {
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+      syncNextNeighborFunc(*ps, domain);
+
+      linkedCells.clear();
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, ipilc, *accessor, linkedCells);
+      // collision
+      linkedCells.forEachParticlePairHalf(
+         useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+         [&collisionResponse, &domain, &timeStepSizeParticles](const size_t idx1, const size_t idx2, auto& ac) {
+            mesa_pd::collision_detection::AnalyticContactDetection acd;
+            mesa_pd::kernel::DoubleCast double_cast;
+            mesa_pd::mpi::ContactFilter contact_filter;
+            if (double_cast(idx1, idx2, ac, acd, ac))
+            {
+               if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), domain))
+               {
+                  collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(),
+                                    acd.getPenetrationDepth(), timeStepSizeParticles);
+               }
+            }
+         },
+         *accessor);
+      reduceAndSwapContactHistory(*ps);
+      // gravity - buoyancy
+      ps->forEachParticle(
+         useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+         [particleDensityRatio, gravitationalAcceleration](const size_t idx, auto& ac) {
+            mesa_pd::addForceAtomic(
+               idx, ac,
+               Vector3< real_t >(real_t(0), real_t(0),
+                                 -(particleDensityRatio - real_c(1)) * ac.getVolume(idx) * gravitationalAcceleration));
+         },
+         *accessor);
+
+      reduceProperty.operator()< mesa_pd::ForceTorqueNotification >(*ps);
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+      syncNextNeighborFunc(*ps, domain);
+
+      if (t % logInterval == 0)
+      {
+         real_t maxVelocity;
+         real_t averageVelocity;
+         uint_t numAveragedParticles;
+         getParticleVelocities(*accessor, numAveragedParticles, maxVelocity, averageVelocity);
+         WALBERLA_LOG_INFO_ON_ROOT("Timestep "
+                                   << t << " / " << numTimeSteps << ", average velocity = " << averageVelocity
+                                   << ", max velocity = " << maxVelocity << ", #particles = " << numAveragedParticles);
+      }
+   }
+
+   // Velocities should be 0 after settling such that the simulation starts from rest
+   ps->forEachParticle(
+      useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+      [](const size_t idx, auto& ac) {
+         ac.setLinearVelocity(idx, Vector3(real_t(0)));
+         ac.setAngularVelocity(idx, Vector3(real_t(0)));
+      },
+      *accessor);
+   syncNextNeighborFunc(*ps, domain);
+}
+
+} // namespace piping
+} // namespace walberla
diff --git a/python/mesa_pd.py b/python/mesa_pd.py
index adbd847e84d7880afa67fd3006a0405c6c8171f0..18c01edbcefb75dc6994fb0a6f6a67fe74dc8465 100755
--- a/python/mesa_pd.py
+++ b/python/mesa_pd.py
@@ -23,6 +23,7 @@ if __name__ == '__main__':
     ps.add_property("invMass", "walberla::real_t", defValue="real_t(1)", syncMode="ON_GHOST_CREATION")
     ps.add_property("force", "walberla::mesa_pd::Vec3", defValue="real_t(0)", syncMode="NEVER")
     ps.add_property("oldForce", "walberla::mesa_pd::Vec3", defValue="real_t(0)", syncMode="ON_OWNERSHIP_CHANGE")
+    ps.add_property("charge", "walberla::real_t", defValue="real_t(0)", syncMode="ALWAYS")
 
     # shape definition for cases with small number of different shapes
     ps.add_property("shapeID", "size_t", defValue="", syncMode="ON_GHOST_CREATION")
@@ -70,6 +71,12 @@ if __name__ == '__main__':
                     syncMode="ON_OWNERSHIP_CHANGE")
     ps.add_property("oldHydrodynamicTorque", "walberla::mesa_pd::Vec3", defValue="real_t(0)",
                     syncMode="ON_OWNERSHIP_CHANGE")
+    ps.add_property("electrostaticForce", "walberla::mesa_pd::Vec3", defValue="real_t(0)",
+                    syncMode="ON_OWNERSHIP_CHANGE")
+
+    # Properties for evaluation purposes
+    ps.add_property("totalDisplacement", "walberla::real_t", defValue="real_t(0)", syncMode="ON_OWNERSHIP_CHANGE")
+    ps.add_property("collisionForceNorm", "walberla::real_t", defValue="real_t(0)", syncMode="ON_OWNERSHIP_CHANGE")
 
     # Properties for virtual mass:
     ps.add_property("virtualMass",                  "walberla::real_t",        defValue="real_t(0)",
@@ -150,6 +157,8 @@ if __name__ == '__main__':
     hftn = mpd.add(mpi.PropertyNotification('HydrodynamicForceTorqueNotification'))
     hftn.add_property('hydrodynamicForce', 'mesa_pd::Vec3', 'Vec3(real_t(0))')
     hftn.add_property('hydrodynamicTorque', 'mesa_pd::Vec3', 'Vec3(real_t(0))')
+    eftn = mpd.add(mpi.PropertyNotification('ElectrostaticForceNotification'))
+    eftn.add_property('electrostaticForce', 'mesa_pd::Vec3', 'Vec3(real_t(0))')
     hfn = mpd.add(mpi.PropertyNotification('HeatFluxNotification'))
     hfn.add_property('heatFlux', 'real_t', 'real_t(0)')
     ncn = mpd.add(mpi.PropertyNotification('NumContactNotification'))
diff --git a/python/mesa_pd/templates/common/ParticleFunctions.templ.h b/python/mesa_pd/templates/common/ParticleFunctions.templ.h
index 61629f05909a60069ccb2b91d69bf79c8a034f2f..d90adcf692baa4bf01dc31fee3a9d9d52fcb2b6d 100644
--- a/python/mesa_pd/templates/common/ParticleFunctions.templ.h
+++ b/python/mesa_pd/templates/common/ParticleFunctions.templ.h
@@ -93,19 +93,19 @@ inline void addForceAtomic(const size_t p_idx, Accessor& ac, const Vec3& f)
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[0]  += f[0];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[1]  += f[1];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[2]  += f[2];
 }
 
@@ -117,19 +117,19 @@ inline void addForceAtWFPosAtomic(const size_t p_idx, Accessor& ac, const Vec3&
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[0]  += f[0];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[1]  += f[1];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getForceRef(p_idx)[2]  += f[2];
 
    const auto t = cross(( wf_pt - ac.getPosition(p_idx) ), f);
@@ -138,19 +138,19 @@ inline void addForceAtWFPosAtomic(const size_t p_idx, Accessor& ac, const Vec3&
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[0] += t[0];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[1] += t[1];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[2] += t[2];
 }
 
@@ -165,19 +165,19 @@ inline void addTorqueAtomic(const size_t p_idx, Accessor& ac, const Vec3& t)
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[0]  += t[0];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[1]  += t[1];
    {%- if module.enableOpenMP %}
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
-   {%- endif %};
+   {%- endif %}
    ac.getTorqueRef(p_idx)[2]  += t[2];
 }
 
diff --git a/python/mesa_pd/templates/kernel/LinearSpringDashpot.templ.h b/python/mesa_pd/templates/kernel/LinearSpringDashpot.templ.h
index efa4138873bc7c70f41b0ee02a4250fa418e6753..bec0b5bb815243f0e61a6d467d15da904b758488 100644
--- a/python/mesa_pd/templates/kernel/LinearSpringDashpot.templ.h
+++ b/python/mesa_pd/templates/kernel/LinearSpringDashpot.templ.h
@@ -240,7 +240,14 @@ inline void LinearSpringDashpot::operator()(const size_t p_idx1,
       const real_t fTabs( std::min( fTLS.length(), fFrictionAbs) );
       const Vec3   fT   ( fTabs * t );
 
-      //TODO check if tangential spring displacement is same for symmetric case
+      // TODO check if tangential spring displacement is same for symmetric case
+      // TODO: check why exactly this critical section is needed
+      {%- if module.enableOpenMP %}
+      #ifdef _OPENMP
+      #pragma omp critical
+      {
+      #endif
+      {%- endif %}
       auto& ch1 = ac.getNewContactHistoryRef(p_idx1)[ac.getUid(p_idx2)];
       ch1.setTangentialSpringDisplacement(newTangentialSpringDisplacement);
       ch1.setIsSticking(isSticking);
@@ -250,6 +257,11 @@ inline void LinearSpringDashpot::operator()(const size_t p_idx1,
       ch2.setTangentialSpringDisplacement(newTangentialSpringDisplacement);
       ch2.setIsSticking(isSticking);
       ch2.setImpactVelocityMagnitude(impactVelocityMagnitude);
+      {%- if module.enableOpenMP %}
+      #ifdef _OPENMP
+      }
+      #endif
+      {%- endif %}
 
       // Add normal force at contact point
       addForceAtWFPosAtomic( p_idx1, ac,  fN, contactPoint );
diff --git a/src/gpu/FieldIndexing.impl.h b/src/gpu/FieldIndexing.impl.h
index a8c9feccfbed0e12b015fe37dadac4aeaa803450..7ec1bb86a00d265c4158f3d37f170caa9b09735d 100644
--- a/src/gpu/FieldIndexing.impl.h
+++ b/src/gpu/FieldIndexing.impl.h
@@ -48,11 +48,11 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
       {
          gpuDeviceProp prop;
          int count;
-         gpuGetDeviceCount(&count);
+         WALBERLA_GPU_CHECK(gpuGetDeviceCount(&count));
          int threadsPerBlock = std::numeric_limits< int >::max();
          for (int i = 0; i < count; i++)
          {
-            gpuGetDeviceProperties(&prop, i);
+            WALBERLA_GPU_CHECK(gpuGetDeviceProperties(&prop, i));
             threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock);
          }
          WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock,
diff --git a/src/gpu/GPUWrapper.h b/src/gpu/GPUWrapper.h
index d4893da8b7ed123596fe18be62186670f990491d..7d103c298f691488817441132d3f0413b26d9ac9 100644
--- a/src/gpu/GPUWrapper.h
+++ b/src/gpu/GPUWrapper.h
@@ -35,6 +35,7 @@
 
     #define gpuMalloc cudaMalloc
     #define gpuMallocHost cudaMallocHost
+    #define gpuMallocManaged cudaMallocManaged
     #define gpuHostAllocDefault cudaHostAllocDefault
     #define gpuHostAlloc cudaHostAlloc
     #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
@@ -45,6 +46,7 @@
     #define gpuMemcpy3D cudaMemcpy3D
     #define gpuMemcpy3DParms cudaMemcpy3DParms
     #define gpuMemcpy3DAsync cudaMemcpy3DAsync
+    #define gpuMemset cudaMemset
 
     #define gpuMemset cudaMemset
     #define gpuMemsetAsync cudaMemsetAsync
@@ -99,6 +101,7 @@
 
     #define gpuMalloc hipMalloc
     #define gpuMallocHost hipHostMalloc
+    #define gpuMallocManaged hipMallocManaged
     #define gpuHostAllocDefault hipHostMallocDefault
     // warning: 'hipHostAlloc' is deprecated: use hipHostMalloc insteadwarning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
     #define gpuHostAlloc hipHostMalloc
@@ -110,6 +113,7 @@
     #define gpuMemcpy3D hipMemcpy3D
     #define gpuMemcpy3DParms hipMemcpy3DParms
     #define gpuMemcpy3DAsync hipMemcpy3DAsync
+    #define gpuMemset hipMemset
 
     #define gpuMemset hipMemset
     #define gpuMemsetAsync hipMemsetAsync
@@ -144,7 +148,7 @@
 
     #define gpuGetDeviceCount hipGetDeviceCount
     #define gpuSetDevice hipSetDevice
-    #define gpuDeviceProp hipDeviceProp
+    #define gpuDeviceProp hipDeviceProp_t
     #define gpuGetDeviceProperties hipGetDeviceProperties
 
     #define gpuLaunchKernel hipLaunchKernel
diff --git a/src/gpu/Kernel.h b/src/gpu/Kernel.h
index f6c2eb687a1d54e6aea2b21f80bfd200d05b371f..59366e5bc6d2bcdbd990cfd8d968a07121efb904 100644
--- a/src/gpu/Kernel.h
+++ b/src/gpu/Kernel.h
@@ -145,6 +145,18 @@ namespace gpu
       CHECK_PARAMETER_FUNC(5)
       CHECK_PARAMETER_FUNC(6)
       CHECK_PARAMETER_FUNC(7)
+      CHECK_PARAMETER_FUNC(8)
+      CHECK_PARAMETER_FUNC(9)
+      CHECK_PARAMETER_FUNC(10)
+      CHECK_PARAMETER_FUNC(11)
+      CHECK_PARAMETER_FUNC(12)
+      CHECK_PARAMETER_FUNC(13)
+      CHECK_PARAMETER_FUNC(14)
+      CHECK_PARAMETER_FUNC(15)
+      CHECK_PARAMETER_FUNC(16)
+      CHECK_PARAMETER_FUNC(17)
+      CHECK_PARAMETER_FUNC(18)
+      CHECK_PARAMETER_FUNC(19)
 
       #undef CHECK_PARAMETER_FUNC
 
@@ -256,6 +268,18 @@ namespace gpu
          case 5: return checkParameter5<T>();
          case 6: return checkParameter6<T>();
          case 7: return checkParameter7<T>();
+         case 8: return checkParameter8<T>();
+         case 9: return checkParameter9<T>();
+         case 10: return checkParameter10<T>();
+         case 11: return checkParameter11<T>();
+         case 12: return checkParameter12<T>();
+         case 13: return checkParameter13<T>();
+         case 14: return checkParameter14<T>();
+         case 15: return checkParameter15<T>();
+         case 16: return checkParameter16<T>();
+         case 17: return checkParameter17<T>();
+         case 18: return checkParameter18<T>();
+         case 19: return checkParameter19<T>();
          default:
             WALBERLA_ABORT("Too many parameters passed to kernel")
       }
diff --git a/src/lbm/PerformanceLogger.h b/src/lbm/PerformanceLogger.h
index 337cf6832e13ab68adb17697ffa1a9f17ab0f7de..13b2a4601cb306bbd2a870381c7a9b0164bd482f 100644
--- a/src/lbm/PerformanceLogger.h
+++ b/src/lbm/PerformanceLogger.h
@@ -167,7 +167,9 @@ void PerformanceLogger<FlagField_T>::getBestResultsForSQLOnRoot( std::map< std::
                                                                  std::map< std::string, double > &      realProperties,
                                                                  std::map< std::string, std::string > & stringProperties )
 {
+   timer_.end();
    performanceEvaluation_.getResultsForSQLOnRoot( integerProperties, realProperties, stringProperties, interval_, getTiming( MAX ) );
+   timer_.start();
 }
 
 
diff --git a/src/lbm_mesapd_coupling/CMakeLists.txt b/src/lbm_mesapd_coupling/CMakeLists.txt
index bde22da3f4a2f4b4f6edcaadc6de8f562292b54f..1e04ba7fd21b3dcf4e4ffc3518c833a4460205ed 100644
--- a/src/lbm_mesapd_coupling/CMakeLists.txt
+++ b/src/lbm_mesapd_coupling/CMakeLists.txt
@@ -12,8 +12,20 @@ target_sources( lbm_mesapd_coupling
       DataTypes.h
       )
 
+# Maximum number of particles that may overlap with a cell. For fully resolved particles, 2 should normally be
+# sufficient (for a sufficiently high stiffness in the DEM).
+set(MAX_PARTICLES_PER_CELL 2)
+add_custom_target(MAX_PARTICLES_PER_CELL) # Make it a target such that the code generation runs again if changed
+target_sources( lbm_mesapd_coupling
+        PRIVATE
+        DataTypesCodegen.h
+        )
+target_compile_definitions(lbm_mesapd_coupling PUBLIC MAX_PARTICLES_PER_CELL=${MAX_PARTICLES_PER_CELL})
+
+
 add_subdirectory( amr )
 add_subdirectory( momentum_exchange_method )
+add_subdirectory( overlapping )
 add_subdirectory( partially_saturated_cells_method )
 add_subdirectory( utility )
 add_subdirectory( mapping )
diff --git a/src/lbm_mesapd_coupling/DataTypesCodegen.h b/src/lbm_mesapd_coupling/DataTypesCodegen.h
new file mode 100644
index 0000000000000000000000000000000000000000..99583a6131a846193826d968e9b8695c5d51d4f6
--- /dev/null
+++ b/src/lbm_mesapd_coupling/DataTypesCodegen.h
@@ -0,0 +1,132 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DataTypesCodegen.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/DataTypes.h"
+
+#include "field/AddToStorage.h"
+#include "field/GhostLayerField.h"
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+#   include "gpu/AddGPUFieldToStorage.h"
+#   include "gpu/GPUField.h"
+#endif
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+const uint MaxParticlesPerCell = MAX_PARTICLES_PER_CELL; // MAX_PARTICLES_PER_CELL comes from CMake
+
+// nOverlappingParticlesField is used to store the amount of overlapping particles per cell
+// B denotes the local weighting factor and is calculated by taking the sum of all local particle
+// weighting factor Bs. The naming of the variables is based on the following paper:
+// https://doi.org/10.1016/j.compfluid.2017.05.033
+// idxField is used to store the indices of the overlapping particles
+// particleVelocitiesField is used to store the velocities of the overlapping particles evaluated at the cell center
+// particleForcesField is used to store the hydrodynamic forces of the cell acting on the overlapping particles
+
+using nOverlappingParticlesField_T = GhostLayerField< uint_t, 1 >;
+using BsField_T                    = GhostLayerField< real_t, MaxParticlesPerCell >;
+using idxField_T                   = GhostLayerField< size_t, MaxParticlesPerCell >;
+using BField_T                     = GhostLayerField< real_t, 1 >;
+using particleVelocitiesField_T    = GhostLayerField< real_t, MaxParticlesPerCell * 3 >;
+using particleForcesField_T        = GhostLayerField< real_t, MaxParticlesPerCell * 3 >;
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+using nOverlappingParticlesFieldGPU_T = walberla::gpu::GPUField< uint_t >;
+using BsFieldGPU_T                    = walberla::gpu::GPUField< real_t >;
+using idxFieldGPU_T                   = walberla::gpu::GPUField< size_t >;
+using BFieldGPU_T                     = walberla::gpu::GPUField< real_t >;
+using particleVelocitiesFieldGPU_T    = walberla::gpu::GPUField< real_t >;
+using particleForcesFieldGPU_T        = walberla::gpu::GPUField< real_t >;
+#endif
+
+// The ParticleAndVolumeFractionSoA encapsulates the data needed by the routines involved in the coupling
+template< int Weighting_T >
+struct ParticleAndVolumeFractionSoA_T
+{
+   BlockDataID nOverlappingParticlesFieldID;
+   BlockDataID BsFieldID;
+   BlockDataID idxFieldID;
+   BlockDataID BFieldID;
+   BlockDataID particleVelocitiesFieldID;
+   BlockDataID particleForcesFieldID;
+   // relaxation rate omega is used for Weighting_T != 1
+   real_t omega_;
+   // UIDs of the particles are stored during mapping, and it is checked that they are the same during the PSM kernel.
+   // This prevents running into troubles due to changed indices
+   std::vector< walberla::id_t > mappingUIDs;
+   // Store positions globally to avoid copying them from CPU to GPU in multiple sweeps (freed in the destructor)
+   real_t* positions = nullptr;
+
+   // nrOfGhostLayers is also 1 for the fields that do not need a ghost layer since the generated sweeps can only
+   // handle fields with the same number of ghost layers
+   ParticleAndVolumeFractionSoA_T(const shared_ptr< StructuredBlockStorage >& bs, const real_t omega) : omega_(omega)
+   {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      nOverlappingParticlesFieldID = walberla::gpu::addGPUFieldToStorage< nOverlappingParticlesFieldGPU_T >(
+         bs, "number of overlapping particles field GPU", uint_t(1), field::fzyx, uint_t(1), true);
+      BsFieldID  = walberla::gpu::addGPUFieldToStorage< BsFieldGPU_T >(bs, "Bs field GPU", MaxParticlesPerCell,
+                                                                       field::fzyx, uint_t(1), true);
+      idxFieldID = walberla::gpu::addGPUFieldToStorage< idxFieldGPU_T >(bs, "idx field GPU", MaxParticlesPerCell,
+                                                                        field::fzyx, uint_t(1), true);
+      BFieldID = walberla::gpu::addGPUFieldToStorage< BFieldGPU_T >(bs, "B field GPU", 1, field::fzyx, uint_t(1), true);
+      particleVelocitiesFieldID = walberla::gpu::addGPUFieldToStorage< particleVelocitiesFieldGPU_T >(
+         bs, "particle velocities field GPU", uint_t(MaxParticlesPerCell * 3), field::fzyx, uint_t(1), true);
+      particleForcesFieldID = walberla::gpu::addGPUFieldToStorage< particleForcesFieldGPU_T >(
+         bs, "particle forces field GPU", uint_t(MaxParticlesPerCell * 3), field::fzyx, uint_t(1), true);
+#else
+      nOverlappingParticlesFieldID = field::addToStorage< nOverlappingParticlesField_T >(
+         bs, "number of overlapping particles field CPU", uint_t(0), field::fzyx, uint_t(1), true);
+      BsFieldID  = field::addToStorage< BsField_T >(bs, "Bs field CPU", real_t(0), field::fzyx, uint_t(1), true);
+      idxFieldID = field::addToStorage< idxField_T >(bs, "idx field CPU", uint_t(0), field::fzyx, uint_t(1), true);
+      BFieldID   = field::addToStorage< BField_T >(bs, "B field CPU", real_t(0), field::fzyx, uint_t(1), true);
+      particleVelocitiesFieldID = field::addToStorage< particleVelocitiesField_T >(
+         bs, "particle velocities field CPU", real_t(0), field::fzyx, uint_t(1), true);
+      particleForcesFieldID = field::addToStorage< particleForcesField_T >(bs, "particle forces field CPU", real_t(0),
+                                                                           field::fzyx, uint_t(1), true);
+#endif
+   }
+
+   ParticleAndVolumeFractionSoA_T(const ParticleAndVolumeFractionSoA_T&) = delete; // a copy would double-free positions
+   ParticleAndVolumeFractionSoA_T& operator=(const ParticleAndVolumeFractionSoA_T&) = delete;
+
+   ~ParticleAndVolumeFractionSoA_T()
+   {
+      if (positions == nullptr) { return; } // nothing to release
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      WALBERLA_GPU_CHECK(gpuFree(positions));
+#else
+      free(positions);
+#endif
+   }
+};
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/overlapping/CMakeLists.txt b/src/lbm_mesapd_coupling/overlapping/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1dfa6dc12d91a77a01f15baa8200835fc78c2fbb
--- /dev/null
+++ b/src/lbm_mesapd_coupling/overlapping/CMakeLists.txt
@@ -0,0 +1,4 @@
+target_sources(lbm_mesapd_coupling
+        PRIVATE
+        OverlapFraction.h
+        )
diff --git a/src/lbm_mesapd_coupling/overlapping/OverlapFraction.h b/src/lbm_mesapd_coupling/overlapping/OverlapFraction.h
index 3dbf68e039af2494020a8ec045f60189a318fd9c..c1fa9d7c99583356f122595d8d8df4190f3e82ad 100644
--- a/src/lbm_mesapd_coupling/overlapping/OverlapFraction.h
+++ b/src/lbm_mesapd_coupling/overlapping/OverlapFraction.h
@@ -16,7 +16,7 @@
 //! \file OverlapFraction.h
 //! \ingroup lbm_mesapd_coupling
 //! \author Samuel Kemmler <samuel.kemmler@fau.de>
-//! \brief Functor that provides overlap fraction computations for different MESA-PD shapes (used for SingleCast)
+//! \brief Functor that provides overlap fraction computations for different MESA-PD shapes (used by SingleCast)
 //
 //======================================================================================================================
 
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/CMakeLists.txt b/src/lbm_mesapd_coupling/partially_saturated_cells_method/CMakeLists.txt
index 65b1c468c80828ea27ca30c9cc8f712e137803cb..4a87ce81e6dc56c3a5f3bbeb1b6d988ca2bed00c 100644
--- a/src/lbm_mesapd_coupling/partially_saturated_cells_method/CMakeLists.txt
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/CMakeLists.txt
@@ -4,3 +4,5 @@ target_sources( lbm_mesapd_coupling
     ParticleAndVolumeFractionMapping.h
     PSMUtility.h
     )
+
+add_subdirectory(codegen)
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMSweep.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMSweep.h
index 8ad996ba8c1a5b4ea61a4add9e29f1a9de67196b..d056e2e76ab010cd2732297778a0e4d6750560cd 100644
--- a/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMSweep.h
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMSweep.h
@@ -221,7 +221,7 @@ void PSMSweep< LatticeModel_T, Filter_T, DensityVelocityIn_T, DensityVelocityOut
                   // total coverage ratio in the cell
                   real_t Bn = real_t(0);
 
-                  // averaged solid collision operator for all intersecting bodies s
+                  // averaged solid collision operator for all intersecting particles s
                   // = \sum_s B_s * \Omega_s_i
                   std::vector< real_t > omega_n(Stencil_T::Size, real_t(0));
 
@@ -398,7 +398,7 @@ void PSMSweep< LatticeModel_T, Filter_T, DensityVelocityIn_T, DensityVelocityOut
                   // total coverage ratio in the cell
                   real_t Bn = real_t(0);
 
-                  // averaged solid collision operator for all intersecting bodies s
+                  // averaged solid collision operator for all intersecting particles s
                   // = \sum_s B_s * \Omega_s_i
                   std::vector< real_t > omega_n(Stencil_T::Size, real_t(0));
 
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMUtility.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMUtility.h
index 854d27f99afd0b40c4eab0ed843ad951a2323cec..aaacd683ee7b50f88ae638fcabb3f547948b6561 100644
--- a/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMUtility.h
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/PSMUtility.h
@@ -51,10 +51,10 @@ namespace psm
  * Weighting_T is like in the PSMSweep.
  */
 template< typename LatticeModel_T, int Weighting_T, typename ParticleAccessor_T >
-Vector3< real_t > getPSMMacroscopicVelocity(
-   const IBlock& block, lbm::PdfField< LatticeModel_T >* pdfField,
-   GhostLayerField< std::vector< std::pair< walberla::size_t, real_t > >, 1 >* particleAndVolumeFractionField,
-   StructuredBlockStorage& blockStorage, const Cell& cell, const ParticleAccessor_T& ac)
+Vector3< real_t > getPSMMacroscopicVelocity(const IBlock& block, lbm::PdfField< LatticeModel_T >* pdfField,
+                                            ParticleAndVolumeFractionField_T* particleAndVolumeFractionField,
+                                            StructuredBlockStorage& blockStorage, const Cell& cell,
+                                            const ParticleAccessor_T& ac)
 {
    static_assert(LatticeModel_T::compressible == false, "Only works with incompressible models!");
    WALBERLA_ASSERT_NOT_NULLPTR(pdfField);
@@ -87,16 +87,16 @@ Vector3< real_t > getPSMMacroscopicVelocity(
    return velocity;
 }
 
-/*!\brief Initializes the PDF field inside the bodies according to the velocities of the bodies.
+/*!\brief Initializes the PDF field inside the particles according to the velocities of the particles.
  *
- * As the Partially Saturated Cells method relies on executing the LBM sweep also inside the bodies, it is good practice
- * (and for some PSM variants also required) to initialize the PDF field ( i.e. the velocity ) in agreement with
- * possible initial velocities of the bodies. This is also the case in the presence of external forces acting on the
- * fluid, as these will often shift the macroscopic velocities during the initialization of the PDF field.
+ * As the Partially Saturated Cells method relies on executing the LBM sweep also inside the particles, it is good
+ * practice (and for some PSM variants also required) to initialize the PDF field ( i.e. the velocity ) in agreement
+ * with possible initial velocities of the particles. This is also the case in the presence of external forces acting on
+ * the fluid, as these will often shift the macroscopic velocities during the initialization of the PDF field.
  *
  * Note, that the ParticleAndVolumeFractionMapping for PSM has to be called first to have a valid field.
  *
- * Only the velocity of cells intersecting with bodies is set, pure fluid cells remain unchanged.
+ * Only the velocity of cells intersecting with particles is set, pure fluid cells remain unchanged.
  */
 template< typename LatticeModel_T, int Weighting_T, typename ParticleAccessor_T >
 void initializeDomainForPSM(StructuredBlockStorage& blockStorage, const BlockDataID& pdfFieldID,
@@ -144,7 +144,7 @@ void initializeDomainForPSM(StructuredBlockStorage& blockStorage, const BlockDat
             Vector3< real_t > fluidVelocityInCell(real_t(0));
             const real_t rho = pdfField->getDensityAndVelocity(fluidVelocityInCell, cell);
 
-            // set the PDFs to equilibrium with the density rho and the average velocity of all intersecting bodies
+            // set the PDFs to equilibrium with the density rho and the average velocity of all intersecting particles
             pdfField->setToEquilibrium(cell,
                                        fluidVelocityInCell * (real_c(1) - totalSolidWeightingInCell) +
                                           weightedAverageParticleVelocityInCell,
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/CMakeLists.txt b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..82b23cd774f80475b781e4bbc092728757277276
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/CMakeLists.txt
@@ -0,0 +1,35 @@
+target_sources(lbm_mesapd_coupling
+        PRIVATE
+        ParticleAndVolumeFractionMappingSweepsCPU.h
+        PSMSweepCollection.h
+        PSMWrapperSweepsCPU.h
+)
+if (WALBERLA_BUILD_WITH_CODEGEN)
+    foreach (collision_setup srt trt mrt cumulant srt-smagorinsky trt-smagorinsky)
+        foreach (solid_collision 1 2 3)
+            set(config ${collision_setup}_sc${solid_collision})
+            walberla_generate_target_from_python(NAME PSMCodegenPython_${config}
+                    FILE PSMCodegen.py
+                    CODEGEN_CFG ${config}_${MAX_PARTICLES_PER_CELL}
+                    OUT_FILES LBMSweep.${CODEGEN_FILE_SUFFIX} LBMSweep.h LBMSplitSweep.${CODEGEN_FILE_SUFFIX} LBMSplitSweep.h
+                    PSMSweep.${CODEGEN_FILE_SUFFIX} PSMSweep.h PSMSweepSplit.${CODEGEN_FILE_SUFFIX} PSMSweepSplit.h
+                    PSMPackInfo.${CODEGEN_FILE_SUFFIX} PSMPackInfo.h InitializeDomainForPSM.${CODEGEN_FILE_SUFFIX} InitializeDomainForPSM.h
+                    PSM_NoSlip.${CODEGEN_FILE_SUFFIX} PSM_NoSlip.h PSM_UBB.${CODEGEN_FILE_SUFFIX} PSM_UBB.h PSM_Density.${CODEGEN_FILE_SUFFIX} PSM_Density.h
+                    PSM_FreeSlip.${CODEGEN_FILE_SUFFIX} PSM_FreeSlip.h PSM_InfoHeader.h PSM_MacroGetter.cpp PSM_MacroGetter.h
+                    PSM_MacroSetter.cpp PSM_MacroSetter.h)
+            add_dependencies(PSMCodegenPython_${config} MAX_PARTICLES_PER_CELL)
+        endforeach ()
+    endforeach ()
+endif ()
+if (WALBERLA_BUILD_WITH_GPU_SUPPORT AND (CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60 OR WALBERLA_BUILD_WITH_HIP))
+    target_sources(lbm_mesapd_coupling
+            PRIVATE
+            ParticleAndVolumeFractionMappingSweepsGPU.h
+            ParticleAndVolumeFractionMappingKernels.${CODEGEN_FILE_SUFFIX}
+            ParticleAndVolumeFractionMappingKernels.h
+            PSMUtilityGPU.h
+            PSMWrapperKernels.${CODEGEN_FILE_SUFFIX}
+            PSMWrapperKernels.h
+            PSMWrapperSweepsGPU.h
+    )
+endif ()
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMCodegen.py b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMCodegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..7564b54db0b76e863990849b0474e1a4f6abfd7d
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMCodegen.py
@@ -0,0 +1,272 @@
+import copy
+import sympy as sp
+import pystencils as ps
+from sympy.core.add import Add
+from sympy.codegen.ast import Assignment
+
+from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel
+from lbmpy.partially_saturated_cells import PSMConfig
+
+from lbmpy.boundaries import NoSlip, UBB, FixedDensity, FreeSlip
+from lbmpy.creationfunctions import (
+    create_lb_update_rule,
+    create_lb_method,
+    create_psm_update_rule,
+)
+
+from lbmpy.macroscopic_value_kernels import (
+    macroscopic_values_getter,
+    macroscopic_values_setter,
+)
+
+from pystencils_walberla import (
+    CodeGeneration,
+    generate_info_header,
+    generate_sweep,
+    generate_pack_info_from_kernel,
+)
+
+from lbmpy_walberla import generate_boundary
+
+# Based on the following paper: https://doi.org/10.1016/j.compfluid.2017.05.033
+
+info_header = """
+const char * infoStencil = "{stencil}";
+const char * infoStreamingPattern = "{streaming_pattern}";
+const char * infoCollisionSetup = "{collision_setup}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+with CodeGeneration() as ctx:
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = LBStencil(Stencil.D3Q27)
+    omega = sp.Symbol("omega")
+    init_density = sp.Symbol("init_density")
+    init_velocity = sp.symbols("init_velocity_:3")
+    pdfs_inter = sp.symbols("pdfs_inter:" + str(stencil.Q))
+    layout = "fzyx"
+    config_tokens = ctx.config.split("_")
+    MaxParticlesPerCell = int(config_tokens[2])
+    methods = {
+        "srt": Method.SRT,
+        "trt": Method.TRT,
+        "mrt": Method.MRT,
+        "cumulant": Method.MONOMIAL_CUMULANT,
+        "srt-smagorinsky": Method.SRT,
+        "trt-smagorinsky": Method.TRT,
+    }
+    # Solid collision variant
+    SC = int(config_tokens[1][2])
+
+    pdfs, pdfs_tmp, velocity_field, density_field = ps.fields(
+        f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}), velocity_field({stencil.D}), density_field({1}): {data_type}[3D]",
+        layout=layout,
+    )
+
+    particle_velocities, particle_forces, Bs = ps.fields(
+        f"particle_v({MaxParticlesPerCell * stencil.D}), particle_f({MaxParticlesPerCell * stencil.D}), Bs({MaxParticlesPerCell}): {data_type}[3D]",
+        layout=layout,
+    )
+
+    # Solid fraction field
+    B = ps.fields(f"b({1}): {data_type}[3D]", layout=layout)
+
+    psm_opt = LBMOptimisation(
+        cse_global=True,
+        symbolic_field=pdfs,
+        symbolic_temporary_field=pdfs_tmp,
+        field_layout=layout,
+    )
+
+    psm_config = PSMConfig(
+        fraction_field=B,
+        object_velocity_field=particle_velocities,
+        SC=SC,
+        MaxParticlesPerCell=MaxParticlesPerCell,
+        individual_fraction_field=Bs,
+        particle_force_field=particle_forces,
+    )
+
+    lbm_config = LBMConfig(
+        stencil=stencil,
+        method=methods[config_tokens[0]],
+        relaxation_rate=omega,
+        force=sp.symbols("F_:3"),
+        force_model=ForceModel.LUO,
+        compressible=True,
+        psm_config=psm_config,
+    )
+
+    if config_tokens[0] == "srt-smagorinsky" or config_tokens[0] == "trt-smagorinsky":
+        lbm_config.smagorinsky = True
+
+    # =====================
+    # Generate method
+    # =====================
+
+    method = create_lb_method(lbm_config=lbm_config)
+
+    node_collection = create_psm_update_rule(lbm_config, psm_opt)
+
+    pdfs_setter = macroscopic_values_setter(
+        method, init_density, init_velocity, pdfs.center_vector
+    )
+
+    # Scale the last 3 setter subexpressions by (1 - B) and add the Bs-weighted particle velocities (mandatory for SC=3)
+    for i, sub_exp in enumerate(pdfs_setter.subexpressions[-3:]):
+        rhs = []
+        for summand in sub_exp.rhs.args:
+            rhs.append(summand * (1.0 - B.center))
+        for p in range(MaxParticlesPerCell):
+            rhs.append(particle_velocities(p * stencil.D + i) * Bs.center(p))
+        pdfs_setter.subexpressions.remove(sub_exp)
+        pdfs_setter.subexpressions.append(Assignment(sub_exp.lhs, Add(*rhs)))
+
+    # =====================
+    # Write method to files
+    # =====================
+
+    if ctx.gpu:
+        target = ps.Target.GPU
+    else:
+        target = ps.Target.CPU
+
+    # Generate files
+    generate_sweep(
+        ctx,
+        "PSMSweep",
+        node_collection,
+        field_swaps=[(pdfs, pdfs_tmp)],
+        target=target,
+    )
+
+    generate_sweep(
+        ctx,
+        "PSMSweepSplit",
+        node_collection,
+        field_swaps=[(pdfs, pdfs_tmp)],
+        target=target,
+        inner_outer_split=True,
+    )
+
+    config_without_psm = LBMConfig(
+        stencil=stencil,
+        method=methods[config_tokens[0]],
+        relaxation_rate=omega,
+        force=sp.symbols("F_:3"),
+        force_model=ForceModel.LUO,
+        compressible=True,
+    )
+
+    if config_tokens[0] == "srt-smagorinsky" or config_tokens[0] == "trt-smagorinsky":
+        config_without_psm.smagorinsky = True
+
+    generate_sweep(
+        ctx,
+        "LBMSweep",
+        create_lb_update_rule(lbm_config=config_without_psm, lbm_optimisation=psm_opt),
+        field_swaps=[(pdfs, pdfs_tmp)],
+        target=target,
+    )
+
+    generate_sweep(
+        ctx,
+        "LBMSplitSweep",
+        create_lb_update_rule(lbm_config=config_without_psm, lbm_optimisation=psm_opt),
+        field_swaps=[(pdfs, pdfs_tmp)],
+        target=target,
+        inner_outer_split=True,
+    )
+
+    generate_pack_info_from_kernel(
+        ctx,
+        "PSMPackInfo",
+        create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=psm_opt),
+        target=target,
+    )
+
+    generate_sweep(ctx, "InitializeDomainForPSM", pdfs_setter, target=target)
+
+    # Boundary conditions
+    generate_boundary(
+        ctx,
+        "PSM_NoSlip",
+        NoSlip(),
+        method,
+        field_name=pdfs.name,
+        streaming_pattern="pull",
+        target=target,
+    )
+
+    bc_velocity = sp.symbols("bc_velocity_:3")
+    generate_boundary(
+        ctx,
+        "PSM_UBB",
+        UBB(bc_velocity),
+        method,
+        field_name=pdfs.name,
+        streaming_pattern="pull",
+        target=target,
+    )
+
+    bc_density = sp.Symbol("bc_density")
+    generate_boundary(
+        ctx,
+        "PSM_Density",
+        FixedDensity(bc_density),
+        method,
+        field_name=pdfs.name,
+        streaming_pattern="pull",
+        target=target,
+    )
+
+    generate_boundary(
+        ctx,
+        "PSM_FreeSlip",
+        FreeSlip(stencil),
+        method,
+        field_name=pdfs.name,
+        streaming_pattern="pull",
+        target=target,
+    )
+
+    # Info header containing correct template definitions for stencil and fields
+    infoHeaderParams = {
+        "stencil": stencil.name,
+        "streaming_pattern": lbm_config.streaming_pattern,
+        "collision_setup": config_tokens[0],
+        "cse_global": int(psm_opt.cse_global),
+        "cse_pdfs": int(psm_opt.cse_pdfs),
+    }
+
+    stencil_typedefs = {"Stencil_T": stencil, "CommunicationStencil_T": stencil}
+    field_typedefs = {
+        "PdfField_T": pdfs,
+        "DensityField_T": density_field,
+        "VelocityField_T": velocity_field,
+    }
+
+    generate_info_header(
+        ctx,
+        "PSM_InfoHeader",
+        stencil_typedefs=stencil_typedefs,
+        field_typedefs=field_typedefs,
+        additional_code=info_header.format(**infoHeaderParams),
+    )
+
+    # Getter & setter to compute moments from pdfs
+    setter_assignments = macroscopic_values_setter(
+        method,
+        velocity=velocity_field.center_vector,
+        pdfs=pdfs.center_vector,
+        density=1.0,
+    )
+    getter_assignments = macroscopic_values_getter(
+        method,
+        density=density_field,
+        velocity=velocity_field.center_vector,
+        pdfs=pdfs.center_vector,
+    )
+    generate_sweep(ctx, "PSM_MacroSetter", setter_assignments)
+    generate_sweep(ctx, "PSM_MacroGetter", getter_assignments)
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec0ed85e2ab09b86eee1034d9f9ca8d9c2120b67
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h
@@ -0,0 +1,124 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMSweepCollection.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+#   include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsGPU.h"
+#   include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsGPU.h"
+#else
+#   include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsCPU.h"
+#   include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsCPU.h"
+#endif
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+// Wraps a sweep so the device is synchronized after each call (correct timeloop timings); inline: defined in a header
+inline auto deviceSyncWrapper = [](std::function< void(IBlock*) > sweep) {
+   return [sweep](IBlock* b) {
+      sweep(b);
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize());
+#endif
+   };
+};
+
+// Bundles the three sweeps run around the PSM kernel: particle mapping, setting particle velocities and reducing
+// particle forces. Every sweep is constructed from the same block storage, accessor, selector and SoA container.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class PSMSweepCollection
+{
+ public:
+   PSMSweepCollection(const shared_ptr< StructuredBlockStorage >& bs, const shared_ptr< ParticleAccessor_T >& ac,
+                      const ParticleSelector_T& ps,
+                      ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                      const Vector3< uint_t > particleSubBlockSize = Vector3< uint_t >(10))
+      : particleMappingSweep(bs, ac, ps, particleAndVolumeFractionSoA, particleSubBlockSize),
+        setParticleVelocitiesSweep(bs, ac, ps, particleAndVolumeFractionSoA),
+        reduceParticleForcesSweep(bs, ac, ps, particleAndVolumeFractionSoA)
+   {}
+
+   SphereFractionMappingSweep< ParticleAccessor_T, ParticleSelector_T, Weighting_T > particleMappingSweep;
+   SetParticleVelocitiesSweep< ParticleAccessor_T, ParticleSelector_T, Weighting_T > setParticleVelocitiesSweep;
+   ReduceParticleForcesSweep< ParticleAccessor_T, ParticleSelector_T, Weighting_T > reduceParticleForcesSweep;
+};
+
+// Registers the particle mapping, set-velocities, PSM and force-reduction sweeps on the timeloop, in that order.
+// With synchronize == true every sweep is wrapped in deviceSyncWrapper.
+template< typename SweepCollection, typename PSMSweep >
+void addPSMSweepsToTimeloop(SweepTimeloop& timeloop, SweepCollection& psmSweepCollection, PSMSweep& psmSweep,
+                            bool synchronize = true)
+{
+   if (!synchronize)
+   {
+      timeloop.add() << Sweep(psmSweepCollection.particleMappingSweep, "Particle mapping");
+      timeloop.add() << Sweep(psmSweepCollection.setParticleVelocitiesSweep, "Set particle velocities");
+      timeloop.add() << Sweep(psmSweep, "PSM sweep");
+      timeloop.add() << Sweep(psmSweepCollection.reduceParticleForcesSweep, "Reduce particle forces");
+      return;
+   }
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.particleMappingSweep), "Particle mapping");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.setParticleVelocitiesSweep),
+                           "Set particle velocities");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweep), "PSM sweep");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.reduceParticleForcesSweep),
+                           "Reduce particle forces");
+}
+
+// Split variant: commTimeloop gets mapping, set-velocities and inner PSM sweep, bracketed by comm start/wait; timeloop
+// gets outer PSM sweep and force reduction. The callbacks capture comm by reference: comm must outlive the timeloops.
+template< typename SweepCollection, typename PSMSweep, typename Communication >
+void addPSMSweepsToTimeloops(SweepTimeloop& commTimeloop, SweepTimeloop& timeloop, Communication& comm,
+                             SweepCollection& psmSweepCollection, PSMSweep& psmSweep, bool synchronize = true)
+{
+   if (!synchronize)
+   {
+      commTimeloop.add() << BeforeFunction([&]() { comm.startCommunication(); })
+                         << Sweep(psmSweepCollection.particleMappingSweep, "Particle mapping");
+      commTimeloop.add() << Sweep(psmSweepCollection.setParticleVelocitiesSweep, "Set particle velocities");
+      commTimeloop.add() << Sweep(psmSweep.getInnerSweep(), "PSM inner sweep")
+                         << AfterFunction([&]() { comm.wait(); }, "LBM Communication (wait)");
+      timeloop.add() << Sweep(psmSweep.getOuterSweep(), "PSM outer sweep");
+      timeloop.add() << Sweep(psmSweepCollection.reduceParticleForcesSweep, "Reduce particle forces");
+      return;
+   }
+   commTimeloop.add() << BeforeFunction([&]() { comm.startCommunication(); })
+                      << Sweep(deviceSyncWrapper(psmSweepCollection.particleMappingSweep), "Particle mapping");
+   commTimeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.setParticleVelocitiesSweep),
+                               "Set particle velocities");
+   commTimeloop.add() << Sweep(deviceSyncWrapper(psmSweep.getInnerSweep()), "PSM inner sweep")
+                      << AfterFunction([&]() { comm.wait(); }, "LBM Communication (wait)");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweep.getOuterSweep()), "PSM outer sweep");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.reduceParticleForcesSweep),
+                           "Reduce particle forces");
+}
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMUtilityGPU.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMUtilityGPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ac6d0a9a53321b18c964da6bdc5b72df501b788
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMUtilityGPU.h
@@ -0,0 +1,112 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMUtilityGPU.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \brief Device helpers: cross product, velocity at a point, atomic force/torque accumulation
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "gpu/FieldAccessor.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+/*!\brief Computes the cross product lhs x rhs of two 3-component vectors and writes it to crossResult.
+ *
+ * All three pointers are declared __restrict__, so crossResult must not alias lhs or rhs.
+ * Declared inline because the function is defined in a header.
+ */
+__device__ inline void cross(real_t* __restrict__ const crossResult, const real_t* __restrict__ const lhs,
+                             const real_t* __restrict__ const rhs)
+{
+   crossResult[0] = lhs[1] * rhs[2] - lhs[2] * rhs[1];
+   crossResult[1] = lhs[2] * rhs[0] - lhs[0] * rhs[2];
+   crossResult[2] = lhs[0] * rhs[1] - lhs[1] * rhs[0];
+}
+
+/*!\brief Computes linearVelocity + angularVelocity x (wf_pt - position) and writes it to velocityAtWFPoint.
+ *
+ * All arguments are indexed as 3-component arrays. NOTE(review): "WF" presumably means world frame - confirm.
+ */
+__device__ inline void getVelocityAtWFPoint(real_t* __restrict__ const velocityAtWFPoint,
+                                            const real_t* __restrict__ const linearVelocity,
+                                            const real_t* __restrict__ const angularVelocity,
+                                            const real_t* __restrict__ const position,
+                                            const real_t* __restrict__ const wf_pt)
+{
+   real_t crossResult[3];
+   real_t rhs[] = { wf_pt[0] - position[0], wf_pt[1] - position[1], wf_pt[2] - position[2] };
+   cross(crossResult, angularVelocity, rhs);
+   velocityAtWFPoint[0] = linearVelocity[0] + crossResult[0];
+   velocityAtWFPoint[1] = linearVelocity[1] + crossResult[1];
+   velocityAtWFPoint[2] = linearVelocity[2] + crossResult[2];
+}
+
+/*!\brief Atomically adds the force f to particleForce and the torque (wf_pt - pos) x f to particleTorque.
+ *
+ * All arguments are indexed as 3-component arrays. CUDA builds use atomicAdd, HIP builds unsafeAtomicAdd.
+ * If neither WALBERLA_BUILD_WITH_CUDA nor WALBERLA_BUILD_WITH_HIP is defined, nothing is accumulated.
+ */
+__device__ inline void addHydrodynamicForceTorqueAtWFPosAtomic(real_t* __restrict__ const particleForce,
+                                                               real_t* __restrict__ const particleTorque,
+                                                               const real_t* __restrict__ const f,
+                                                               const real_t* __restrict__ const pos,
+                                                               const real_t* __restrict__ const wf_pt)
+{
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   atomicAdd(&(particleForce[0]), f[0]);
+   atomicAdd(&(particleForce[1]), f[1]);
+   atomicAdd(&(particleForce[2]), f[2]);
+#endif
+
+   // Using unsafeAtomicAdd ensures that HW FP Atomics are used instead of CAS loops, see:
+   // https://fs.hlrs.de/projects/par/events/2023/GPU-AMD/day3/11.%20AMD_Node_Memory_Model.pdf
+#ifdef WALBERLA_BUILD_WITH_HIP
+   unsafeAtomicAdd(&(particleForce[0]), f[0]);
+   unsafeAtomicAdd(&(particleForce[1]), f[1]);
+   unsafeAtomicAdd(&(particleForce[2]), f[2]);
+#endif
+
+   real_t torque[] = { 0.0, 0.0, 0.0 };
+   real_t lhs[]    = { wf_pt[0] - pos[0], wf_pt[1] - pos[1], wf_pt[2] - pos[2] };
+   cross(torque, lhs, f);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   atomicAdd(&(particleTorque[0]), torque[0]);
+   atomicAdd(&(particleTorque[1]), torque[1]);
+   atomicAdd(&(particleTorque[2]), torque[2]);
+#endif
+
+#ifdef WALBERLA_BUILD_WITH_HIP
+   unsafeAtomicAdd(&(particleTorque[0]), torque[0]);
+   unsafeAtomicAdd(&(particleTorque[1]), torque[1]);
+   unsafeAtomicAdd(&(particleTorque[2]), torque[2]);
+#endif
+}
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cpp b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bbf4439f56f5e3acae17bef0cc81742437ef5be
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cpp
@@ -0,0 +1,23 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMWrapperKernels.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \brief Mirror PSMWrapperKernels.cu to provide a .cpp file for HIP
+//
+//======================================================================================================================
+
+#include "PSMWrapperKernels.cu"
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cu b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9aacf68d450bc8a6dbd2f8c5288f6116eaed2e0b
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.cu
@@ -0,0 +1,101 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMWrapperKernels.cu
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \brief Provide two kernels that need to be called before and after the PSM kernel
+//
+//======================================================================================================================
+
+#include "PSMUtilityGPU.h"
+#include "PSMWrapperKernels.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+//! For the cell addressed by (blockIdx, threadIdx): store each overlapping particle's velocity at the cell center.
+__global__ void SetParticleVelocities(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                                      walberla::gpu::FieldAccessor< uint_t > idxField,
+                                      walberla::gpu::FieldAccessor< real_t > particleVelocitiesField,
+                                      real_t* __restrict__ const linearVelocities,
+                                      real_t* __restrict__ const angularVelocities,
+                                      real_t* __restrict__ const positions, const double3 blockStart, const real_t dx)
+{
+   const uint3 blockIdx_uint3  = make_uint3(blockIdx.x, blockIdx.y, blockIdx.z);
+   const uint3 threadIdx_uint3 = make_uint3(threadIdx.x, threadIdx.y, threadIdx.z);
+   nOverlappingParticlesField.set(blockIdx_uint3, threadIdx_uint3);
+   idxField.set(blockIdx_uint3, threadIdx_uint3);
+   particleVelocitiesField.set(blockIdx_uint3, threadIdx_uint3);
+
+   // Cell center of this WF point; explicit casts avoid an implicit double -> real_t narrowing inside the braces
+   const real_t cellCenter[] = { real_t(blockStart.x + (threadIdx.x + 0.5) * dx),
+                                 real_t(blockStart.y + (blockIdx.x + 0.5) * dx),
+                                 real_t(blockStart.z + (blockIdx.y + 0.5) * dx) };
+
+   // Compute the particle velocity at the cell center for all overlapping particles
+   for (uint_t p = 0; p < nOverlappingParticlesField.get(); p++)
+   {
+      const auto offset = idxField.get(p) * 3; // first of the three components of particle idxField.get(p)
+      real_t particleVelocityAtWFPoint[] = { 0.0, 0.0, 0.0 };
+      getVelocityAtWFPoint(particleVelocityAtWFPoint, &linearVelocities[offset], &angularVelocities[offset],
+                           &positions[offset], cellCenter);
+      for (uint_t d = 0; d < 3; ++d) { particleVelocitiesField.get(p * 3 + d) = particleVelocityAtWFPoint[d]; }
+   }
+}
+
+//! For the cell addressed by (blockIdx, threadIdx): scale each overlapping particle's force by forceScalingFactor and
+//! pass it to addHydrodynamicForceTorqueAtWFPosAtomic together with the particle position and the cell center.
+__global__ void ReduceParticleForces(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                                     walberla::gpu::FieldAccessor< id_t > idxField,
+                                     walberla::gpu::FieldAccessor< real_t > particleForcesField,
+                                     real_t* __restrict__ const hydrodynamicForces,
+                                     real_t* __restrict__ const hydrodynamicTorques,
+                                     real_t* __restrict__ const positions, const double3 blockStart, const real_t dx,
+                                     const real_t forceScalingFactor)
+{
+   const uint3 blockIdx_uint3  = make_uint3(blockIdx.x, blockIdx.y, blockIdx.z);
+   const uint3 threadIdx_uint3 = make_uint3(threadIdx.x, threadIdx.y, threadIdx.z);
+   nOverlappingParticlesField.set(blockIdx_uint3, threadIdx_uint3);
+   idxField.set(blockIdx_uint3, threadIdx_uint3);
+   particleForcesField.set(blockIdx_uint3, threadIdx_uint3);
+
+   // Cell center of this WF point; explicit casts avoid an implicit double -> real_t narrowing inside the braces
+   const real_t cellCenter[] = { real_t(blockStart.x + (threadIdx.x + 0.5) * dx),
+                                 real_t(blockStart.y + (blockIdx.x + 0.5) * dx),
+                                 real_t(blockStart.z + (blockIdx.y + 0.5) * dx) };
+
+   // Reduce the forces for all overlapping particles
+   for (uint_t p = 0; p < nOverlappingParticlesField.get(); p++)
+   {
+      const auto offset = idxField.get(p) * 3; // first of the three components of particle idxField.get(p)
+      real_t forceOnParticle[] = { particleForcesField.get(p * 3 + 0) * forceScalingFactor,
+                                   particleForcesField.get(p * 3 + 1) * forceScalingFactor,
+                                   particleForcesField.get(p * 3 + 2) * forceScalingFactor };
+      addHydrodynamicForceTorqueAtWFPosAtomic(&hydrodynamicForces[offset], &hydrodynamicTorques[offset],
+                                              forceOnParticle, &positions[offset], cellCenter);
+   }
+}
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3afc21c7c6d387883e25185ed7a67f7cb03f511
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperKernels.h
@@ -0,0 +1,54 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMWrapperKernels.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "gpu/FieldAccessor.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+//! Kernel storing, per cell, the velocity of each overlapping particle at the cell center (see PSMWrapperKernels.cu).
+__global__ void SetParticleVelocities(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                                      walberla::gpu::FieldAccessor< uint_t > idxField,
+                                      walberla::gpu::FieldAccessor< real_t > particleVelocitiesField,
+                                      real_t* __restrict__ const linearVelocities,
+                                      real_t* __restrict__ const angularVelocities,
+                                      real_t* __restrict__ const positions, const double3 blockStart,
+                                      const real_t dx);
+//! Kernel passing, per cell, the scaled force of each overlapping particle to the atomic force/torque accumulation.
+__global__ void ReduceParticleForces(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                                     walberla::gpu::FieldAccessor< id_t > idxField,
+                                     walberla::gpu::FieldAccessor< real_t > particleForcesField,
+                                     real_t* __restrict__ const hydrodynamicForces,
+                                     real_t* __restrict__ const hydrodynamicTorques,
+                                     real_t* __restrict__ const positions, const double3 blockStart, const real_t dx,
+                                     const real_t forceScalingFactor);
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsCPU.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsCPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4ab73dd4533dd24d0aec203fa772fe734ce08e7
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsCPU.h
@@ -0,0 +1,250 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMWrapperSweepsCPU.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "domain_decomposition/StructuredBlockStorage.h"
+
+#include "field/GhostLayerField.h"
+
+#include "lbm/sweeps/StreamPull.h"
+#include "lbm/sweeps/SweepBase.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/utility/ParticleFunctions.h"
+
+#include "mesa_pd/common/ParticleFunctions.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include <cassert>
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+//! CPU sweep that stores, for every cell of a block, the velocity of each overlapping particle evaluated at the cell
+//! center, i.e. linear velocity + angular velocity x (cell center - particle position), in particleVelocitiesField.
+//! The selected particles must match mappingUIDs (asserted): idxField values index arrays built in selection order.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class SetParticleVelocitiesSweep
+{
+ public:
+   //! \param bs                           block storage (stored, not read by operator())
+   //! \param ac                           particle accessor queried for UIDs and linear/angular velocities
+   //! \param mappingParticleSelector      selects the mapped particles; held by reference, so it must outlive the sweep
+   //! \param particleAndVolumeFractionSoA source of the field IDs, mappingUIDs and particle positions
+   SetParticleVelocitiesSweep(const shared_ptr< StructuredBlockStorage >& bs,
+                              const shared_ptr< ParticleAccessor_T >& ac,
+                              const ParticleSelector_T& mappingParticleSelector,
+                              ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA)
+      : bs_(bs), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA)
+   {}
+
+   //! Fills particleVelocitiesField of \p block; does nothing if no particle is selected.
+   void operator()(IBlock* block)
+   {
+      // Check that uids of the particles have not changed since the last mapping to avoid incorrect indices
+      std::vector< walberla::id_t > currentUIDs;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_)) { currentUIDs.push_back(ac_->getUid(idx)); }
+      }
+      WALBERLA_ASSERT(particleAndVolumeFractionSoA_.mappingUIDs == currentUIDs);
+
+      // One UID was collected per selected particle, so no second pass over the accessor is needed
+      const size_t numMappedParticles = currentUIDs.size();
+      if (numMappedParticles == size_t(0)) return;
+
+      // Host copies (three components per mapped particle) of the particle information required for computing the
+      // velocity at a WF point (used in the solid collision operator); std::vector releases them on every exit path
+      std::vector< real_t > linearVelocities(numMappedParticles * 3, real_t(0));
+      std::vector< real_t > angularVelocities(numMappedParticles * 3, real_t(0));
+
+      // Store particle information inside memory
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               linearVelocities[idxMapped * 3 + d]  = ac_->getLinearVelocity(idx)[d];
+               angularVelocities[idxMapped * 3 + d] = ac_->getAngularVelocity(idx)[d];
+            }
+            idxMapped++;
+         }
+      }
+
+      auto nOverlappingParticlesField =
+         block->getData< nOverlappingParticlesField_T >(particleAndVolumeFractionSoA_.nOverlappingParticlesFieldID);
+      auto idxField = block->getData< idxField_T >(particleAndVolumeFractionSoA_.idxFieldID);
+      auto particleVelocitiesField =
+         block->getData< particleVelocitiesField_T >(particleAndVolumeFractionSoA_.particleVelocitiesFieldID);
+
+      // For every cell, compute the particle velocities of the overlapping particles evaluated at the cell center.
+      // The cell center is minCorner + (cell index + 0.5) * dx, the same formula the GPU kernel SetParticleVelocities
+      // uses; scaling only the 0.5 by dx would be correct for dx == 1 only.
+      const real_t dx = block->getAABB().xSize() / real_t(nOverlappingParticlesField->xSize());
+      WALBERLA_FOR_ALL_CELLS_XYZ(
+         particleVelocitiesField, const Vector3< real_t > cellCenter =
+                                     Vector3< real_t >((real_t(x) + real_t(0.5)) * dx, (real_t(y) + real_t(0.5)) * dx,
+                                                       (real_t(z) + real_t(0.5)) * dx) +
+                                     block->getAABB().minCorner();
+         for (uint_t p = 0; p < nOverlappingParticlesField->get(x, y, z); p++) {
+            // Position of this particle's first component in the flat per-particle arrays
+            const auto offset = idxField->get(x, y, z, p) * 3;
+            const Vector3< real_t > linearVelocity(linearVelocities[offset + 0], linearVelocities[offset + 1],
+                                                   linearVelocities[offset + 2]);
+            const Vector3< real_t > angularVelocity(angularVelocities[offset + 0], angularVelocities[offset + 1],
+                                                    angularVelocities[offset + 2]);
+            const Vector3< real_t > particleToCell(
+               cellCenter[0] - particleAndVolumeFractionSoA_.positions[offset + 0],
+               cellCenter[1] - particleAndVolumeFractionSoA_.positions[offset + 1],
+               cellCenter[2] - particleAndVolumeFractionSoA_.positions[offset + 2]);
+            const Vector3< real_t > particleVelocityAtWFPoint = linearVelocity + cross(angularVelocity, particleToCell);
+            particleVelocitiesField->get(x, y, z, p * 3 + 0) = particleVelocityAtWFPoint[0];
+            particleVelocitiesField->get(x, y, z, p * 3 + 1) = particleVelocityAtWFPoint[1];
+            particleVelocitiesField->get(x, y, z, p * 3 + 2) = particleVelocityAtWFPoint[2];
+         })
+   }
+
+ private:
+   shared_ptr< StructuredBlockStorage > bs_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+};
+
+//! CPU sweep that accumulates, per mapped particle, the scaled forces stored in particleForcesField over all cells of
+//! a block together with the torques they exert about the particle position, and adds the sums to the hydrodynamic
+//! force and torque of the particle in the accessor.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class ReduceParticleForcesSweep
+{
+ public:
+   //! \param bs                           block storage, queried for dx of the block's level
+   //! \param ac                           particle accessor queried for UIDs; receives the reduced forces and torques
+   //! \param mappingParticleSelector      selects the mapped particles; held by reference, so it must outlive the sweep
+   //! \param particleAndVolumeFractionSoA source of the field IDs, mappingUIDs and particle positions
+   ReduceParticleForcesSweep(const shared_ptr< StructuredBlockStorage >& bs, const shared_ptr< ParticleAccessor_T >& ac,
+                             const ParticleSelector_T& mappingParticleSelector,
+                             const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA)
+      : bs_(bs), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA)
+   {}
+
+   //! Reduces the particle forces of \p block; does nothing if no particle is selected.
+   void operator()(IBlock* block)
+   {
+      // Check that uids of the particles have not changed since the last mapping to avoid incorrect indices
+      std::vector< walberla::id_t > currentUIDs;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_)) { currentUIDs.push_back(ac_->getUid(idx)); }
+      }
+      WALBERLA_ASSERT(particleAndVolumeFractionSoA_.mappingUIDs == currentUIDs);
+
+      const real_t dxCurrentLevel      = bs_->dx(bs_->getLevel(*block));
+      const real_t lengthScalingFactor = dxCurrentLevel;
+      const real_t forceScalingFactor  = lengthScalingFactor * lengthScalingFactor;
+
+      // One UID was collected per selected particle, so no second pass over the accessor is needed
+      const size_t numMappedParticles = currentUIDs.size();
+      if (numMappedParticles == size_t(0)) return;
+
+      // Zero-initialized accumulators (three components per mapped particle) for the reduction of the particle
+      // forces and torques; std::vector releases them on every exit path
+      std::vector< real_t > hydrodynamicForces(numMappedParticles * 3, real_t(0));
+      std::vector< real_t > hydrodynamicTorques(numMappedParticles * 3, real_t(0));
+
+      auto nOverlappingParticlesField =
+         block->getData< nOverlappingParticlesField_T >(particleAndVolumeFractionSoA_.nOverlappingParticlesFieldID);
+      auto idxField = block->getData< idxField_T >(particleAndVolumeFractionSoA_.idxFieldID);
+      auto particleForcesField =
+         block->getData< particleForcesField_T >(particleAndVolumeFractionSoA_.particleForcesFieldID);
+
+      // For every cell, reduce the hydrodynamic forces and torques of the overlapping particles. The cell center is
+      // minCorner + (cell index + 0.5) * dx, the same formula the GPU kernel ReduceParticleForces uses.
+      // NOTE(review): the accumulators are updated without synchronization; confirm that WALBERLA_FOR_ALL_CELLS_XYZ
+      // does not process cells concurrently here.
+      const real_t dx = block->getAABB().xSize() / real_t(nOverlappingParticlesField->xSize());
+      WALBERLA_FOR_ALL_CELLS_XYZ(
+         particleForcesField, const Vector3< real_t > cellCenter =
+                                 Vector3< real_t >((real_t(x) + real_t(0.5)) * dx, (real_t(y) + real_t(0.5)) * dx,
+                                                   (real_t(z) + real_t(0.5)) * dx) +
+                                 block->getAABB().minCorner();
+         for (uint_t p = 0; p < nOverlappingParticlesField->get(x, y, z); p++) {
+            // Position of this particle's first component in the flat per-particle arrays
+            const auto offset = idxField->get(x, y, z, p) * 3;
+            Vector3< real_t > forceOnParticle(particleForcesField->get(x, y, z, p * 3 + 0),
+                                              particleForcesField->get(x, y, z, p * 3 + 1),
+                                              particleForcesField->get(x, y, z, p * 3 + 2));
+            forceOnParticle[0] *= forceScalingFactor;
+            forceOnParticle[1] *= forceScalingFactor;
+            forceOnParticle[2] *= forceScalingFactor;
+
+            // Torque about the particle position: (cell center - particle position) x force
+            const Vector3< real_t > particleToCell(
+               cellCenter[0] - particleAndVolumeFractionSoA_.positions[offset + 0],
+               cellCenter[1] - particleAndVolumeFractionSoA_.positions[offset + 1],
+               cellCenter[2] - particleAndVolumeFractionSoA_.positions[offset + 2]);
+            const Vector3< real_t > torqueOnParticle = cross(particleToCell, forceOnParticle);
+
+            for (uint_t d = 0; d < 3; ++d) {
+               hydrodynamicForces[offset + d] += forceOnParticle[d];
+               hydrodynamicTorques[offset + d] += torqueOnParticle[d];
+            }
+         })
+
+      // Copy forces and torques of particles
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               ac_->getHydrodynamicForceRef(idx)[d] += hydrodynamicForces[idxMapped * 3 + d];
+               ac_->getHydrodynamicTorqueRef(idx)[d] += hydrodynamicTorques[idxMapped * 3 + d];
+            }
+            idxMapped++;
+         }
+      }
+   }
+
+ private:
+   shared_ptr< StructuredBlockStorage > bs_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+};
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsGPU.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsGPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..863d13fbfa2108668608c83bed2eb2f9ab463f84
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMWrapperSweepsGPU.h
@@ -0,0 +1,253 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSMWrapperSweepsGPU.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "domain_decomposition/StructuredBlockStorage.h"
+
+#include "field/GhostLayerField.h"
+
+#include "gpu/FieldIndexing.h"
+#include "gpu/GPUField.h"
+#include "gpu/Kernel.h"
+#include "gpu/sweeps/GPUSweepBase.h"
+
+#include "lbm/sweeps/StreamPull.h"
+#include "lbm/sweeps/SweepBase.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/utility/ParticleFunctions.h"
+
+#include "mesa_pd/common/ParticleFunctions.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include <cassert>
+
+#include "PSMWrapperKernels.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+//! GPU sweep that copies the linear and angular velocities of the mapped particles to the device and launches the
+//! SetParticleVelocities kernel on the block's GPU fields.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class SetParticleVelocitiesSweep
+{
+ public:
+   //! \note mappingParticleSelector and particleAndVolumeFractionSoA are held by reference and must outlive the sweep.
+   SetParticleVelocitiesSweep(const shared_ptr< StructuredBlockStorage >& bs,
+                              const shared_ptr< ParticleAccessor_T >& ac,
+                              const ParticleSelector_T& mappingParticleSelector,
+                              ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA)
+      : bs_(bs), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA)
+   {}
+
+   //! Launches the kernel for \p block; does nothing if no particle is selected.
+   void operator()(IBlock* block)
+   {
+      // Check that uids of the particles have not changed since the last mapping to avoid incorrect indices
+      std::vector< walberla::id_t > currentUIDs;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_)) { currentUIDs.push_back(ac_->getUid(idx)); }
+      }
+      WALBERLA_ASSERT(particleAndVolumeFractionSoA_.mappingUIDs == currentUIDs);
+
+      // One UID was collected per selected particle, so no second pass over the accessor is needed
+      const size_t numMappedParticles = currentUIDs.size();
+      if (numMappedParticles == size_t(0)) return;
+
+      const size_t arraySizes = numMappedParticles * sizeof(real_t) * 3;
+
+      // Host staging buffers for the particle information required for computing the velocity at a WF point (used in
+      // the solid collision operator); std::vector releases them on every exit path
+      std::vector< real_t > linearVelocities_h(numMappedParticles * 3);
+      std::vector< real_t > angularVelocities_h(numMappedParticles * 3);
+
+      // Store particle information inside memory to communicate information to the GPU
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               linearVelocities_h[idxMapped * 3 + d]  = ac_->getLinearVelocity(idx)[d];
+               angularVelocities_h[idxMapped * 3 + d] = ac_->getAngularVelocity(idx)[d];
+            }
+            idxMapped++;
+         }
+      }
+
+      real_t* linearVelocities = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&linearVelocities, arraySizes));
+      WALBERLA_GPU_CHECK(gpuMemcpy(linearVelocities, linearVelocities_h.data(), arraySizes, gpuMemcpyHostToDevice));
+      real_t* angularVelocities = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&angularVelocities, arraySizes));
+      WALBERLA_GPU_CHECK(gpuMemcpy(angularVelocities, angularVelocities_h.data(), arraySizes, gpuMemcpyHostToDevice));
+
+      auto nOverlappingParticlesField =
+         block->getData< nOverlappingParticlesFieldGPU_T >(particleAndVolumeFractionSoA_.nOverlappingParticlesFieldID);
+      auto idxField = block->getData< idxFieldGPU_T >(particleAndVolumeFractionSoA_.idxFieldID);
+      auto particleVelocitiesField =
+         block->getData< particleVelocitiesFieldGPU_T >(particleAndVolumeFractionSoA_.particleVelocitiesFieldID);
+
+      // For every cell, compute the particle velocities of the overlapping particles evaluated at the cell center
+      // NOTE(review): idxField is indexed as id_t here, while the kernel SetParticleVelocities declares its idxField
+      // parameter as FieldAccessor< uint_t >; confirm that both names denote the same type.
+      auto velocitiesKernel = walberla::gpu::make_kernel(&(SetParticleVelocities));
+      velocitiesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< uint_t >::xyz(*nOverlappingParticlesField));
+      velocitiesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< id_t >::xyz(*idxField));
+      velocitiesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*particleVelocitiesField));
+      velocitiesKernel.addParam(linearVelocities);
+      velocitiesKernel.addParam(angularVelocities);
+      velocitiesKernel.addParam(particleAndVolumeFractionSoA_.positions);
+      const double3 blockStart = { block->getAABB().minCorner()[0], block->getAABB().minCorner()[1],
+                                   block->getAABB().minCorner()[2] };
+      velocitiesKernel.addParam(blockStart);
+      velocitiesKernel.addParam(block->getAABB().xSize() / real_t(nOverlappingParticlesField->xSize()));
+      velocitiesKernel();
+
+      WALBERLA_GPU_CHECK(gpuFree(linearVelocities));
+      WALBERLA_GPU_CHECK(gpuFree(angularVelocities));
+   }
+
+ private:
+   shared_ptr< StructuredBlockStorage > bs_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+};
+
+//! GPU sweep that launches the ReduceParticleForces kernel on the block's GPU fields, copies the per-particle force
+//! and torque sums back to the host and adds them to the hydrodynamic force and torque of the particles in the
+//! accessor.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class ReduceParticleForcesSweep
+{
+ public:
+   //! \note mappingParticleSelector and particleAndVolumeFractionSoA are held by reference and must outlive the sweep.
+   ReduceParticleForcesSweep(const shared_ptr< StructuredBlockStorage >& bs, const shared_ptr< ParticleAccessor_T >& ac,
+                             const ParticleSelector_T& mappingParticleSelector,
+                             const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA)
+      : bs_(bs), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA)
+   {}
+
+   //! Reduces the particle forces of \p block; does nothing if no particle is selected.
+   void operator()(IBlock* block)
+   {
+      // Check that uids of the particles have not changed since the last mapping to avoid incorrect indices
+      std::vector< walberla::id_t > currentUIDs;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_)) { currentUIDs.push_back(ac_->getUid(idx)); }
+      }
+      WALBERLA_ASSERT(particleAndVolumeFractionSoA_.mappingUIDs == currentUIDs);
+
+      const real_t dxCurrentLevel      = bs_->dx(bs_->getLevel(*block));
+      const real_t lengthScalingFactor = dxCurrentLevel;
+      const real_t forceScalingFactor  = lengthScalingFactor * lengthScalingFactor;
+
+      // One UID was collected per selected particle, so no second pass over the accessor is needed
+      const size_t numMappedParticles = currentUIDs.size();
+      if (numMappedParticles == size_t(0)) return;
+
+      const size_t arraySizes = numMappedParticles * sizeof(real_t) * 3;
+
+      // Allocate zeroed memory for the reduction of the particle forces and torques on the GPU
+      real_t* hydrodynamicForces = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&hydrodynamicForces, arraySizes));
+      WALBERLA_GPU_CHECK(gpuMemset(hydrodynamicForces, 0, arraySizes));
+
+      real_t* hydrodynamicTorques = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&hydrodynamicTorques, arraySizes));
+      WALBERLA_GPU_CHECK(gpuMemset(hydrodynamicTorques, 0, arraySizes));
+
+      auto nOverlappingParticlesField =
+         block->getData< nOverlappingParticlesFieldGPU_T >(particleAndVolumeFractionSoA_.nOverlappingParticlesFieldID);
+      auto idxField = block->getData< idxFieldGPU_T >(particleAndVolumeFractionSoA_.idxFieldID);
+      auto particleForcesField =
+         block->getData< particleForcesFieldGPU_T >(particleAndVolumeFractionSoA_.particleForcesFieldID);
+
+      const double3 blockStart = { block->getAABB().minCorner()[0], block->getAABB().minCorner()[1],
+                                   block->getAABB().minCorner()[2] };
+
+      // For every cell, reduce the hydrodynamic forces and torques of the overlapping particles
+      auto forcesKernel = walberla::gpu::make_kernel(&(ReduceParticleForces));
+      forcesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< uint_t >::xyz(*nOverlappingParticlesField));
+      forcesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< id_t >::xyz(*idxField));
+      forcesKernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*particleForcesField));
+      forcesKernel.addParam(hydrodynamicForces);
+      forcesKernel.addParam(hydrodynamicTorques);
+      forcesKernel.addParam(particleAndVolumeFractionSoA_.positions);
+      forcesKernel.addParam(blockStart);
+      forcesKernel.addParam(block->getAABB().xSize() / real_t(nOverlappingParticlesField->xSize()));
+      forcesKernel.addParam(forceScalingFactor);
+      forcesKernel();
+
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize());
+
+      // Host buffers receiving the reduced forces and torques (three components per mapped particle); std::vector
+      // releases them on every exit path
+      std::vector< real_t > hydrodynamicForces_h(numMappedParticles * 3);
+      WALBERLA_GPU_CHECK(gpuMemcpy(hydrodynamicForces_h.data(), hydrodynamicForces, arraySizes, gpuMemcpyDeviceToHost));
+      std::vector< real_t > hydrodynamicTorques_h(numMappedParticles * 3);
+      WALBERLA_GPU_CHECK(gpuMemcpy(hydrodynamicTorques_h.data(), hydrodynamicTorques, arraySizes, gpuMemcpyDeviceToHost));
+
+      // Add the forces and torques copied back from the GPU to the particles
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               ac_->getHydrodynamicForceRef(idx)[d] += hydrodynamicForces_h[idxMapped * 3 + d];
+               ac_->getHydrodynamicTorqueRef(idx)[d] += hydrodynamicTorques_h[idxMapped * 3 + d];
+            }
+            idxMapped++;
+         }
+      }
+
+      WALBERLA_GPU_CHECK(gpuFree(hydrodynamicForces));
+      WALBERLA_GPU_CHECK(gpuFree(hydrodynamicTorques));
+   }
+
+ private:
+   shared_ptr< StructuredBlockStorage > bs_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+};
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cpp b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43d8c9786c6153aad985528fc249a69897dd4451
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cpp
@@ -0,0 +1,23 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMappingKernels.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \brief Mirror ParticleAndVolumeFractionMappingKernels.cu to provide a .cpp file for HIP
+//
+//======================================================================================================================
+
+#include "ParticleAndVolumeFractionMappingKernels.cu"
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cu b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fe22d4eb1dd4e1d75eeb7c8c5a7bd137a15a6b64
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.cu
@@ -0,0 +1,313 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMappingKernels.cu
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+
+#include <assert.h>
+
+#include "ParticleAndVolumeFractionMappingKernels.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+// Functions to calculate the weighting Bs of a cell from its overlap fraction epsilon; callers pass tau = 1 / omega.
+// No __restrict__ on 'weighting': the kernels below pass the same field entry as output pointer and as 'epsilon'.
+template< int Weighting_T >
+__device__ void calculateWeighting(real_t* const /*weighting*/, const real_t& /*epsilon*/, const real_t& /*tau*/)
+{
+   WALBERLA_STATIC_ASSERT(Weighting_T == 1 || Weighting_T == 2);
+}
+template<>
+__device__ void calculateWeighting< 1 >(real_t* const weighting, const real_t& epsilon, const real_t& /*tau*/)
+{
+   *weighting = epsilon; // Weighting 1: Bs equals the overlap fraction
+}
+template<>
+__device__ void calculateWeighting< 2 >(real_t* const weighting, const real_t& epsilon, const real_t& tau)
+{
+   // Weighting 2: Bs = epsilon * (tau - 1/2) / ((1 - epsilon) + (tau - 1/2))
+   *weighting = epsilon * (tau - real_t(0.5)) / ((real_t(1) - epsilon) + (tau - real_t(0.5)));
+}
+// Maps spheres onto cells by super sampling: Bs is derived from the share of a cell's sample points inside a sphere.
+template< int Weighting_T >
+__global__ void superSampling(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                              walberla::gpu::FieldAccessor< real_t > BsField,
+                              walberla::gpu::FieldAccessor< id_t > idxField,
+                              walberla::gpu::FieldAccessor< real_t > BField, const real_t omega,
+                              const real_t* __restrict__ const spherePositions,
+                              const real_t* __restrict__ const sphereRadii, const double3 blockStart, const real_t dx,
+                              const int3 nSamples, const size_t* __restrict__ const numParticlesSubBlocks,
+                              const size_t* __restrict__ const particleIDsSubBlocks, const uint3 subBlocksPerDim)
+{
+   const uint3 blockIdx_uint3  = make_uint3(blockIdx.x, blockIdx.y, blockIdx.z);
+   const uint3 threadIdx_uint3 = make_uint3(threadIdx.x, threadIdx.y, threadIdx.z);
+   // Bind all field accessors to the block/thread index of this thread
+   nOverlappingParticlesField.set(blockIdx_uint3, threadIdx_uint3);
+   BsField.set(blockIdx_uint3, threadIdx_uint3);
+   idxField.set(blockIdx_uint3, threadIdx_uint3);
+   BField.set(blockIdx_uint3, threadIdx_uint3);
+
+   // Clear the fields
+   for (uint i = 0; i < MaxParticlesPerCell; i++)
+   {
+      BsField.get(i)  = real_t(0.0);
+      idxField.get(i) = size_t(0);
+   }
+   nOverlappingParticlesField.get() = uint_t(0);
+   BField.get()                     = real_t(0.0);
+   // Sub block of this thread: exact integer floor(index * subBlocksPerDim / extent) instead of rounded real_t math
+   const ulong3 subBlockIndex = { size_t(threadIdx.x) * subBlocksPerDim.x / blockDim.x,
+                                  size_t(blockIdx.x) * subBlocksPerDim.y / gridDim.x,
+                                  size_t(blockIdx.y) * subBlocksPerDim.z / gridDim.y };
+   size_t linearizedSubBlockIndex =
+      subBlockIndex.z * subBlocksPerDim.x * subBlocksPerDim.y + subBlockIndex.y * subBlocksPerDim.x + subBlockIndex.x;
+   // Sample points are sampleDistance apart; the first one lies one sampleDistance inside the cell's minimum corner
+   double3 sampleDistance = { 1.0 / (nSamples.x + 1) * dx, 1.0 / (nSamples.y + 1) * dx, 1.0 / (nSamples.z + 1) * dx };
+   double3 startSamplingPoint = { (blockStart.x + threadIdx.x * dx + sampleDistance.x),
+                                  (blockStart.y + blockIdx.x * dx + sampleDistance.y),
+                                  (blockStart.z + blockIdx.y * dx + sampleDistance.z) };
+   for (uint i = 0; i < numParticlesSubBlocks[linearizedSubBlockIndex]; i++)
+   {
+      // The i-th particle IDs of all sub blocks are stored contiguously (stride = total number of sub blocks)
+      size_t idxMapped =
+         particleIDsSubBlocks[linearizedSubBlockIndex + i * subBlocksPerDim.x * subBlocksPerDim.y * subBlocksPerDim.z];
+      double3 currentSamplingPoint = startSamplingPoint;
+      // Axis-aligned bounding box of the sphere
+      double3 minCornerSphere = { spherePositions[idxMapped * 3] - sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 1] - sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 2] - sphereRadii[idxMapped] };
+      double3 maxCornerSphere = { spherePositions[idxMapped * 3] + sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 1] + sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 2] + sphereRadii[idxMapped] };
+
+      double overlapFraction = 0.0;
+      // Count sample points only if the sampled region can intersect the bounding box of the sphere
+      if (startSamplingPoint.x + dx > minCornerSphere.x && startSamplingPoint.x < maxCornerSphere.x &&
+          startSamplingPoint.y + dx > minCornerSphere.y && startSamplingPoint.y < maxCornerSphere.y &&
+          startSamplingPoint.z + dx > minCornerSphere.z && startSamplingPoint.z < maxCornerSphere.z)
+      {
+         for (uint_t z = 0; z < nSamples.z; z++)
+         {
+            currentSamplingPoint.y = startSamplingPoint.y;
+            for (uint_t y = 0; y < nSamples.y; y++)
+            {
+               currentSamplingPoint.x = startSamplingPoint.x;
+               for (uint_t x = 0; x < nSamples.x; x++)
+               {
+                  if ((currentSamplingPoint.x - spherePositions[idxMapped * 3]) *
+                            (currentSamplingPoint.x - spherePositions[idxMapped * 3]) +
+                         (currentSamplingPoint.y - spherePositions[idxMapped * 3 + 1]) *
+                            (currentSamplingPoint.y - spherePositions[idxMapped * 3 + 1]) +
+                         (currentSamplingPoint.z - spherePositions[idxMapped * 3 + 2]) *
+                            (currentSamplingPoint.z - spherePositions[idxMapped * 3 + 2]) <=
+                      sphereRadii[idxMapped] * sphereRadii[idxMapped])
+                  {
+                     overlapFraction += 1.0;
+                  }
+                  currentSamplingPoint.x += sampleDistance.x;
+               }
+               currentSamplingPoint.y += sampleDistance.y;
+            }
+            currentSamplingPoint.z += sampleDistance.z;
+         }
+
+         // store overlap fraction only if there is an intersection
+         if (overlapFraction > 0.0)
+         {
+            assert(nOverlappingParticlesField.get() < MaxParticlesPerCell);
+            BsField.get(nOverlappingParticlesField.get()) = overlapFraction;
+            BsField.get(nOverlappingParticlesField.get()) *= 1.0 / (nSamples.x * nSamples.y * nSamples.z);
+            calculateWeighting< Weighting_T >(&BsField.get(nOverlappingParticlesField.get()),
+                                              BsField.get(nOverlappingParticlesField.get()), real_t(1.0) / omega);
+            idxField.get(nOverlappingParticlesField.get()) = idxMapped;
+            BField.get() += BsField.get(nOverlappingParticlesField.get());
+            nOverlappingParticlesField.get() += 1;
+         }
+      }
+   }
+
+   // Normalize fraction field (Bs) if sum over all fractions (B) > 1
+   if (BField.get() > 1)
+   {
+      for (uint i = 0; i < nOverlappingParticlesField.get(); i++)
+      {
+         BsField.get(i) /= BField.get();
+      }
+      BField.get() = 1.0;
+   }
+}
+
+// Maps spheres onto cells using the linear overlap-fraction approximation of https://doi.org/10.1108/EC-02-2016-0052
+template< int Weighting_T >
+__global__ void
+   linearApproximation(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                       walberla::gpu::FieldAccessor< real_t > BsField, walberla::gpu::FieldAccessor< id_t > idxField,
+                       walberla::gpu::FieldAccessor< real_t > BField, const real_t omega,
+                       const real_t* __restrict__ const spherePositions, const real_t* __restrict__ const sphereRadii,
+                       const real_t* __restrict__ const f_rs, const double3 blockStart, const real_t dx,
+                       const size_t* __restrict__ const numParticlesSubBlocks,
+                       const size_t* __restrict__ const particleIDsSubBlocks, const uint3 subBlocksPerDim)
+{
+   const uint3 blockIdx_uint3  = make_uint3(blockIdx.x, blockIdx.y, blockIdx.z);
+   const uint3 threadIdx_uint3 = make_uint3(threadIdx.x, threadIdx.y, threadIdx.z);
+   // Bind all field accessors to the block/thread index of this thread
+   nOverlappingParticlesField.set(blockIdx_uint3, threadIdx_uint3);
+   BsField.set(blockIdx_uint3, threadIdx_uint3);
+   idxField.set(blockIdx_uint3, threadIdx_uint3);
+   BField.set(blockIdx_uint3, threadIdx_uint3);
+
+   // Clear the fields
+   for (uint i = 0; i < MaxParticlesPerCell; i++)
+   {
+      BsField.get(i)  = real_t(0.0);
+      idxField.get(i) = size_t(0);
+   }
+   nOverlappingParticlesField.get() = uint_t(0);
+   BField.get()                     = real_t(0.0);
+   // Sub block of this thread: exact integer floor(index * subBlocksPerDim / extent) instead of rounded real_t math
+   const ulong3 subBlockIndex = { size_t(threadIdx.x) * subBlocksPerDim.x / blockDim.x,
+                                  size_t(blockIdx.x) * subBlocksPerDim.y / gridDim.x,
+                                  size_t(blockIdx.y) * subBlocksPerDim.z / gridDim.y };
+   size_t linearizedSubBlockIndex =
+      subBlockIndex.z * subBlocksPerDim.x * subBlocksPerDim.y + subBlockIndex.y * subBlocksPerDim.x + subBlockIndex.x;
+   // Center of the cell handled by this thread (x from threadIdx.x, y from blockIdx.x, z from blockIdx.y)
+   const double3 cellCenter   = { (blockStart.x + (threadIdx.x + 0.5) * dx), (blockStart.y + (blockIdx.x + 0.5) * dx),
+                                  (blockStart.z + (blockIdx.y + 0.5) * dx) };
+   for (uint i = 0; i < numParticlesSubBlocks[linearizedSubBlockIndex]; i++)
+   {
+      size_t idxMapped =
+         particleIDsSubBlocks[linearizedSubBlockIndex + i * subBlocksPerDim.x * subBlocksPerDim.y * subBlocksPerDim.z];
+      double3 minCornerSphere = { spherePositions[idxMapped * 3] - sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 1] - sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 2] - sphereRadii[idxMapped] };
+      double3 maxCornerSphere = { spherePositions[idxMapped * 3] + sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 1] + sphereRadii[idxMapped],
+                                  spherePositions[idxMapped * 3 + 2] + sphereRadii[idxMapped] };
+      if (cellCenter.x + dx > minCornerSphere.x && cellCenter.x - dx < maxCornerSphere.x &&
+          cellCenter.y + dx > minCornerSphere.y && cellCenter.y - dx < maxCornerSphere.y &&
+          cellCenter.z + dx > minCornerSphere.z && cellCenter.z - dx < maxCornerSphere.z)
+      {
+         const double3 cellSphereVector = { spherePositions[idxMapped * 3] - cellCenter.x,
+                                            spherePositions[idxMapped * 3 + 1] - cellCenter.y,
+                                            spherePositions[idxMapped * 3 + 2] - cellCenter.z };
+         // D: distance from the cell center to the sphere surface
+         const real_t D = sqrt(cellSphereVector.x * cellSphereVector.x + cellSphereVector.y * cellSphereVector.y +
+                               cellSphereVector.z * cellSphereVector.z) -
+                          sphereRadii[idxMapped];
+         // Linear approximation of the overlap fraction, clamped to [0, 1] in real_t like the CPU implementation
+         real_t epsilon = -D + f_rs[idxMapped];
+         epsilon        = max(epsilon, real_t(0));
+         epsilon        = min(epsilon, real_t(1));
+
+         // Store overlap fraction only if there is an intersection
+         if (epsilon > 0.0)
+         {
+            // Check that the maximum number of overlapping particles has not yet been reached
+            assert(nOverlappingParticlesField.get() < MaxParticlesPerCell);
+            BsField.get(nOverlappingParticlesField.get()) = epsilon;
+            calculateWeighting< Weighting_T >(&BsField.get(nOverlappingParticlesField.get()),
+                                              BsField.get(nOverlappingParticlesField.get()), real_t(1.0) / omega);
+            idxField.get(nOverlappingParticlesField.get()) = idxMapped;
+            BField.get() += BsField.get(nOverlappingParticlesField.get());
+            nOverlappingParticlesField.get() += 1;
+         }
+      }
+   }
+
+   // Normalize fraction field (Bs) if sum over all fractions (B) > 1
+   if (BField.get() > 1)
+   {
+      for (uint i = 0; i < nOverlappingParticlesField.get(); i++)
+      {
+         BsField.get(i) /= BField.get();
+      }
+      BField.get() = 1.0;
+   }
+}
+// Adds the overlap fraction of a single axis-aligned box (particle index idxMapped) on top of the existing mapping.
+template< int Weighting_T >
+__global__ void boxMapping(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                           walberla::gpu::FieldAccessor< real_t > BsField,
+                           walberla::gpu::FieldAccessor< id_t > idxField, walberla::gpu::FieldAccessor< real_t > BField,
+                           const real_t omega, const double3 boxPositionMin, const double3 boxPositionMax,
+                           const double3 blockStart, const real_t dx, const id_t idxMapped)
+{
+   // Bind all field accessors to the block/thread index of this thread
+   const uint3 gridPosition   = make_uint3(blockIdx.x, blockIdx.y, blockIdx.z);
+   const uint3 threadPosition = make_uint3(threadIdx.x, threadIdx.y, threadIdx.z);
+   nOverlappingParticlesField.set(gridPosition, threadPosition);
+   BsField.set(gridPosition, threadPosition);
+   idxField.set(gridPosition, threadPosition);
+   BField.set(gridPosition, threadPosition);
+
+   // Axis-aligned extent of the cell: center -/+ half a cell width
+   const real_t halfDx  = dx * real_t(0.5);
+   const double3 center = { (blockStart.x + (threadIdx.x + 0.5) * dx), (blockStart.y + (blockIdx.x + 0.5) * dx),
+                            (blockStart.z + (blockIdx.y + 0.5) * dx) };
+   const double3 lower  = { center.x - halfDx, center.y - halfDx, center.z - halfDx };
+   const double3 upper  = { center.x + halfDx, center.y + halfDx, center.z + halfDx };
+
+   // Overlap length of box and cell per axis (zero if they are disjoint along that axis)
+   const real_t overlapX = max(real_t(0), min(boxPositionMax.x, upper.x) - max(boxPositionMin.x, lower.x));
+   const real_t overlapY = max(real_t(0), min(boxPositionMax.y, upper.y) - max(boxPositionMin.y, lower.y));
+   const real_t overlapZ = max(real_t(0), min(boxPositionMax.z, upper.z) - max(boxPositionMin.z, lower.z));
+   const real_t fraction = overlapX * overlapY * overlapZ / (dx * dx * dx);
+
+   // Cells that do not intersect the box keep their current values
+   if (!(fraction > real_t(0))) { return; }
+
+   // Append the box as a further overlapping particle of this cell
+   assert(nOverlappingParticlesField.get() < MaxParticlesPerCell);
+   const uint_t slot = nOverlappingParticlesField.get();
+   BsField.get(slot) = fraction;
+   calculateWeighting< Weighting_T >(&BsField.get(slot), BsField.get(slot), real_t(1.0) / omega);
+   idxField.get(slot) = idxMapped;
+   BField.get() += BsField.get(slot);
+   nOverlappingParticlesField.get() += 1;
+
+   // TODO: it can happen that the BsField for spheres is normalized twice, one here and in the sphere mapping
+   // Normalize fraction field (Bs) if sum over all fractions (B) > 1
+   if (BField.get() > 1)
+   {
+      const uint_t numOverlaps = nOverlappingParticlesField.get();
+      for (uint i = 0; i < numOverlaps; i++)
+         BsField.get(i) /= BField.get();
+      BField.get() = 1.0;
+   }
+}
+// Referencing each specialization instantiates the kernels for Weighting_T = 1 and 2 in this translation unit
+auto instance0_with_weighting_1 = superSampling< 1 >;
+auto instance1_with_weighting_2 = superSampling< 2 >;
+auto instance2_with_weighting_1 = linearApproximation< 1 >;
+auto instance3_with_weighting_2 = linearApproximation< 2 >;
+auto instance4_with_weighting_1 = boxMapping< 1 >;
+auto instance5_with_weighting_2 = boxMapping< 2 >;
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..32ee62c3bcc87d401f7cc0962e7a25fdf888f22c
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingKernels.h
@@ -0,0 +1,68 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMappingKernels.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/DataTypes.h"
+#include "core/math/Vector3.h"
+
+#include "gpu/FieldAccessor.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+//! Sphere mapping by super sampling with nSamples sample points per direction and cell (defined in the .cu file)
+template< int Weighting_T >
+__global__ void superSampling(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                              walberla::gpu::FieldAccessor< real_t > BsField,
+                              walberla::gpu::FieldAccessor< id_t > idxField,
+                              walberla::gpu::FieldAccessor< real_t > BField, const real_t omega,
+                              const real_t* __restrict__ const spherePositions,
+                              const real_t* __restrict__ const sphereRadii, const double3 blockStart, const real_t dx,
+                              const int3 nSamples, const size_t* __restrict__ const numParticlesSubBlocks,
+                              const size_t* __restrict__ const particleIDsSubBlocks, const uint3 subBlocksPerDim);
+//! Sphere mapping using the linear overlap-fraction approximation of https://doi.org/10.1108/EC-02-2016-0052
+template< int Weighting_T >
+__global__ void
+   linearApproximation(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                       walberla::gpu::FieldAccessor< real_t > BsField, walberla::gpu::FieldAccessor< id_t > idxField,
+                       walberla::gpu::FieldAccessor< real_t > BField, const real_t omega,
+                       const real_t* __restrict__ const spherePositions, const real_t* __restrict__ const sphereRadii,
+                       const real_t* __restrict__ const f_rs, const double3 blockStart, const real_t dx,
+                       const size_t* __restrict__ const numParticlesSubBlocks,
+                       const size_t* __restrict__ const particleIDsSubBlocks, const uint3 subBlocksPerDim);
+//! Mapping of a single axis-aligned box [boxPositionMin, boxPositionMax] that is stored as particle index idxMapped
+template< int Weighting_T >
+__global__ void boxMapping(walberla::gpu::FieldAccessor< uint_t > nOverlappingParticlesField,
+                           walberla::gpu::FieldAccessor< real_t > BsField,
+                           walberla::gpu::FieldAccessor< id_t > idxField, walberla::gpu::FieldAccessor< real_t > BField,
+                           const real_t omega, const double3 boxPositionMin, const double3 boxPositionMax,
+                           const double3 blockStart, const real_t dx, const id_t idxMapped);
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsCPU.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsCPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ab47b344512d165b8e6c7d5718179f5725a3525
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsCPU.h
@@ -0,0 +1,318 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMappingSweepsCPU.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "domain_decomposition/StructuredBlockStorage.h"
+
+#include "field/GhostLayerField.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/mapping/ParticleBoundingBox.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+
+#include "mesa_pd/common/AABBConversion.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/kernel/SingleCast.h"
+
+#include <cassert>
+#include <functional>
+#include <mesa_pd/data/ParticleStorage.h>
+#include <mesa_pd/data/shape/Sphere.h>
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+// The CPU version is deliberately placed in the gpu namespace so that application codes need no changes
+namespace gpu
+{
+// Convert the overlap fraction epsilon of a cell into the weighting Bs (written to *weighting); tau is 1 / omega.
+template< int Weighting_T >
+void calculateWeighting(real_t* const /*weighting*/, const real_t& /*epsilon*/, const real_t& /*tau*/)
+{
+   WALBERLA_STATIC_ASSERT(Weighting_T == 1 || Weighting_T == 2);
+}
+// Full specializations are not implicitly inline: mark them inline so this header can be included in several TUs.
+template<>
+inline void calculateWeighting< 1 >(real_t* const weighting, const real_t& epsilon, const real_t& /*tau*/)
+{
+   *weighting = epsilon; // Weighting 1: Bs equals the overlap fraction
+}
+template<>
+inline void calculateWeighting< 2 >(real_t* const weighting, const real_t& epsilon, const real_t& tau)
+{
+   *weighting = epsilon * (tau - real_t(0.5)) / ((real_t(1) - epsilon) + (tau - real_t(0.5)));
+}
+// CPU sphere mapping (linear approximation): recomputes Bs, idx, B and the overlap count for the cells of blockIt.
+template< int Weighting_T >
+void mapParticles(IBlock& blockIt, const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                  const real_t* const spherePositions, const real_t* const sphereRadii, const real_t* const f_rs,
+                  const size_t* const numParticlesSubBlocks, const size_t* const particleIDsSubBlocks,
+                  const Vector3< uint_t > subBlocksPerDim)
+{
+   auto nOverlappingParticlesField =
+      blockIt.getData< nOverlappingParticlesField_T >(particleAndVolumeFractionSoA.nOverlappingParticlesFieldID);
+   auto BsField  = blockIt.getData< BsField_T >(particleAndVolumeFractionSoA.BsFieldID);
+   auto idxField = blockIt.getData< idxField_T >(particleAndVolumeFractionSoA.idxFieldID);
+   auto BField   = blockIt.getData< BField_T >(particleAndVolumeFractionSoA.BFieldID);
+   // Cell width, derived from the x extent of the block only (the same dx is used for all three directions below)
+   real_t dx = blockIt.getAABB().xSize() / real_t(nOverlappingParticlesField->xSize());
+   // For each cell: reset its entries, then add every sphere listed for the sub block that contains the cell
+   WALBERLA_FOR_ALL_CELLS_XYZ(
+      BField,
+      for (size_t i = 0; i < MaxParticlesPerCell; i++) {
+         BsField->get(x, y, z, i)  = real_t(0.0);
+         idxField->get(x, y, z, i) = size_t(0);
+      } nOverlappingParticlesField->get(x, y, z) = uint_t(0);
+      BField->get(x, y, z)                       = real_t(0.0);
+      // Cell center = block min corner + (index + 0.5) * dx; sub block index from exact integer math on cell counts
+      const Vector3< real_t > cellCenter =
+         Vector3< real_t >(real_t(x) + real_t(0.5), real_t(y) + real_t(0.5), real_t(z) + real_t(0.5)) * dx +
+         blockIt.getAABB().minCorner();
+      const Vector3< size_t > subBlockIndex(size_t(x) * subBlocksPerDim[0] / nOverlappingParticlesField->xSize(),
+                                            size_t(y) * subBlocksPerDim[1] / nOverlappingParticlesField->ySize(),
+                                            size_t(z) * subBlocksPerDim[2] / nOverlappingParticlesField->zSize());
+      const size_t linearizedSubBlockIndex = subBlockIndex[2] * subBlocksPerDim[0] * subBlocksPerDim[1] +
+                                             subBlockIndex[1] * subBlocksPerDim[0] + subBlockIndex[0];
+      for (size_t i = 0; i < numParticlesSubBlocks[linearizedSubBlockIndex]; i++) {
+         size_t idxMapped = particleIDsSubBlocks[linearizedSubBlockIndex +
+                                                 i * subBlocksPerDim[0] * subBlocksPerDim[1] * subBlocksPerDim[2]];
+         const Vector3< real_t > minCornerSphere(spherePositions[idxMapped * 3] - sphereRadii[idxMapped],
+                                                 spherePositions[idxMapped * 3 + 1] - sphereRadii[idxMapped],
+                                                 spherePositions[idxMapped * 3 + 2] - sphereRadii[idxMapped]);
+         const Vector3< real_t > maxCornerSphere(spherePositions[idxMapped * 3] + sphereRadii[idxMapped],
+                                                 spherePositions[idxMapped * 3 + 1] + sphereRadii[idxMapped],
+                                                 spherePositions[idxMapped * 3 + 2] + sphereRadii[idxMapped]);
+         if (cellCenter[0] + dx > minCornerSphere[0] && cellCenter[0] - dx < maxCornerSphere[0] &&
+             cellCenter[1] + dx > minCornerSphere[1] && cellCenter[1] - dx < maxCornerSphere[1] &&
+             cellCenter[2] + dx > minCornerSphere[2] && cellCenter[2] - dx < maxCornerSphere[2])
+         {
+            const Vector3< real_t > cellSphereVector(spherePositions[idxMapped * 3] - cellCenter[0],
+                                                     spherePositions[idxMapped * 3 + 1] - cellCenter[1],
+                                                     spherePositions[idxMapped * 3 + 2] - cellCenter[2]);
+            // D: distance from the cell center to the sphere surface
+            const real_t D =
+               real_t(sqrt(cellSphereVector[0] * cellSphereVector[0] + cellSphereVector[1] * cellSphereVector[1] +
+                           cellSphereVector[2] * cellSphereVector[2])) -
+               sphereRadii[idxMapped];
+            // Linear approximation of the overlap fraction, clamped to [0, 1]
+            real_t epsilon = -D + f_rs[idxMapped];
+            epsilon        = std::max(epsilon, real_t(0));
+            epsilon        = std::min(epsilon, real_t(1));
+
+            // Store overlap fraction only if there is an intersection
+            if (epsilon > 0.0)
+            {
+               // Check that the maximum number of overlapping particles has not yet been reached
+               assert(nOverlappingParticlesField->get(x, y, z) < MaxParticlesPerCell);
+               BsField->get(x, y, z, nOverlappingParticlesField->get(x, y, z)) = epsilon;
+               calculateWeighting< Weighting_T >(&BsField->get(x, y, z, nOverlappingParticlesField->get(x, y, z)),
+                                                 BsField->get(x, y, z, nOverlappingParticlesField->get(x, y, z)),
+                                                 real_t(1.0) / particleAndVolumeFractionSoA.omega_);
+               idxField->get(x, y, z, nOverlappingParticlesField->get(x, y, z)) = idxMapped;
+               BField->get(x, y, z) += BsField->get(x, y, z, nOverlappingParticlesField->get(x, y, z));
+               nOverlappingParticlesField->get(x, y, z) += 1;
+            }
+         }
+      }
+
+      // Normalize fraction field (Bs) if sum over all fractions (B) > 1
+      if (BField->get(x, y, z) > 1) {
+         for (size_t i = 0; i < nOverlappingParticlesField->get(x, y, z); i++)
+         {
+            BsField->get(x, y, z, i) /= BField->get(x, y, z);
+         }
+         BField->get(x, y, z) = 1.0;
+      })
+}
+
+// Sweep that gathers the selected particles of an accessor and hands the spheres among them to mapParticles().
+// Per call it collects position, radius and the f_r term of every selected particle and sorts the spheres into the
+// sub-blocks of the block they overlap, so that the mapping only has to check potentially overlapping particles.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class SphereFractionMappingSweep
+{
+ public:
+   // Aborts if the extent of any block is not divisible by subBlockSize in every dimension.
+   SphereFractionMappingSweep(const shared_ptr< StructuredBlockStorage >& blockStorage,
+                              const shared_ptr< ParticleAccessor_T >& ac,
+                              const ParticleSelector_T& mappingParticleSelector,
+                              ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                              const Vector3< uint_t > subBlockSize)
+      : blockStorage_(blockStorage), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA), subBlockSize_(subBlockSize)
+   {
+      static_assert(std::is_base_of< mesa_pd::data::IAccessor, ParticleAccessor_T >::value,
+                    "Provide a valid accessor as template");
+      for (auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt)
+      {
+         auto aabb = blockIt->getAABB();
+         if (size_t(aabb.xSize()) % subBlockSize_[0] != 0 || size_t(aabb.ySize()) % subBlockSize_[1] != 0 ||
+             size_t(aabb.zSize()) % subBlockSize_[2] != 0)
+         {
+            WALBERLA_ABORT("Number of cells per block (" << aabb << ") is not divisible by subBlockSize ("
+                                                         << subBlockSize_ << ").")
+         }
+      }
+   }
+
+   // Rebuilds the particle arrays for this block and calls mapParticles(). When no particle is selected, only the
+   // stored UIDs are cleared and the function returns before mapParticles() is reached.
+   void operator()(IBlock* block)
+   {
+      size_t numMappedParticles = 0;
+      particleAndVolumeFractionSoA_.mappingUIDs.clear();
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            numMappedParticles++;
+            // Store UIDs to make sure that the particles have not changed between the mapping and the PSM sweep
+            particleAndVolumeFractionSoA_.mappingUIDs.push_back(ac_->getUid(idx));
+         }
+      }
+
+      if (numMappedParticles == uint_t(0)) return;
+
+      // The positions stay inside the SoA after this call and are released with free() on the next call, so they
+      // remain malloc'ed. All other buffers are only needed until mapParticles() returns and are owned by vectors.
+      const size_t scalarArraySize = numMappedParticles * sizeof(real_t);
+      if (particleAndVolumeFractionSoA_.positions != nullptr) { free(particleAndVolumeFractionSoA_.positions); }
+      particleAndVolumeFractionSoA_.positions = (real_t*) malloc(3 * scalarArraySize);
+      if (particleAndVolumeFractionSoA_.positions == nullptr)
+      {
+         WALBERLA_ABORT("Allocating the particle positions failed.")
+      }
+      std::vector< real_t > radii(numMappedParticles);
+      std::vector< real_t > f_r(numMappedParticles); // f_r is described in https://doi.org/10.1108/EC-02-2016-0052
+
+      // Store particle information inside the memory
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               particleAndVolumeFractionSoA_.positions[idxMapped * 3 + d] = ac_->getPosition(idx)[d];
+            }
+            // If other shapes than spheres are mapped, ignore them here
+            if (ac_->getShape(idx)->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE)
+            {
+               const real_t radius = static_cast< mesa_pd::data::Sphere* >(ac_->getShape(idx))->getRadius();
+               radii[idxMapped]    = radius;
+               // NOTE: sqrt(radius * radius - 0.5) is only real for radius * radius >= 0.5
+               real_t Va           = real_t(
+                  (1.0 / 12.0 - radius * radius) * atan((0.5 * sqrt(radius * radius - 0.5)) / (0.5 - radius * radius)) +
+                  1.0 / 3.0 * sqrt(radius * radius - 0.5) +
+                  (radius * radius - 1.0 / 12.0) * atan(0.5 / sqrt(radius * radius - 0.5)) -
+                  4.0 / 3.0 * radius * radius * radius * atan(0.25 / (radius * sqrt(radius * radius - 0.5))));
+               f_r[idxMapped] = Va - radius + real_t(0.5);
+            }
+            idxMapped++;
+         }
+      }
+
+      // Update fraction mapping
+      // Split the block into sub-blocks and sort the particle indices into each overlapping sub-block. This way, in
+      // the particle mapping, each iteration only has to check the potentially overlapping particles.
+      auto blockAABB = block->getAABB();
+      const Vector3< uint_t > subBlocksPerDim =
+         Vector3< uint_t >(uint_t(blockAABB.xSize()) / subBlockSize_[0], uint_t(blockAABB.ySize()) / subBlockSize_[1],
+                           uint_t(blockAABB.zSize()) / subBlockSize_[2]);
+      const size_t numSubBlocks = subBlocksPerDim[0] * subBlocksPerDim[1] * subBlocksPerDim[2];
+      std::vector< std::vector< size_t > > subBlocks(numSubBlocks);
+
+      // idxMapped counts every selected particle (not only spheres) so that it matches the indexing used above.
+      idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            if (ac_->getShape(idx)->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE)
+            {
+               auto sphereAABB = mesa_pd::getParticleAABB(idx, *ac_);
+               if (blockAABB.intersects(sphereAABB))
+               {
+                  auto intersectionAABB = blockAABB.getIntersection(sphereAABB);
+                  intersectionAABB.translate(-blockAABB.minCorner());
+                  mesa_pd::Vec3 blockScaling = mesa_pd::Vec3(real_t(subBlocksPerDim[0]) / blockAABB.sizes()[0],
+                                                             real_t(subBlocksPerDim[1]) / blockAABB.sizes()[1],
+                                                             real_t(subBlocksPerDim[2]) / blockAABB.sizes()[2]);
+
+                  for (size_t z = size_t(intersectionAABB.zMin() * blockScaling[2]);
+                       z < size_t(ceil(intersectionAABB.zMax() * blockScaling[2])); ++z)
+                  {
+                     for (size_t y = size_t(intersectionAABB.yMin() * blockScaling[1]);
+                          y < size_t(ceil(intersectionAABB.yMax() * blockScaling[1])); ++y)
+                     {
+                        for (size_t x = size_t(intersectionAABB.xMin() * blockScaling[0]);
+                             x < size_t(ceil(intersectionAABB.xMax() * blockScaling[0])); ++x)
+                        {
+                           size_t index = z * subBlocksPerDim[0] * subBlocksPerDim[1] + y * subBlocksPerDim[0] + x;
+                           subBlocks[index].push_back(idxMapped);
+                        }
+                     }
+                  }
+               }
+            }
+            idxMapped++;
+         }
+      }
+
+      size_t maxParticlesPerSubBlock = 0;
+      for (const auto& subBlock : subBlocks)
+      {
+         maxParticlesPerSubBlock = std::max(maxParticlesPerSubBlock, subBlock.size());
+      }
+
+      // Flatten the per-sub-block lists: entry k of sub-block "index" is stored at index + k * numSubBlocks.
+      std::vector< size_t > numParticlesPerSubBlock(numSubBlocks);
+      std::vector< size_t > particleIDsSubBlocks(numSubBlocks * maxParticlesPerSubBlock);
+      for (size_t index = 0; index < numSubBlocks; ++index)
+      {
+         numParticlesPerSubBlock[index] = subBlocks[index].size();
+         for (size_t k = 0; k < subBlocks[index].size(); k++)
+         {
+            particleIDsSubBlocks[index + k * numSubBlocks] = subBlocks[index][k];
+         }
+      }
+
+      mapParticles(*block, particleAndVolumeFractionSoA_, particleAndVolumeFractionSoA_.positions, radii.data(),
+                   f_r.data(), numParticlesPerSubBlock.data(), particleIDsSubBlocks.data(), subBlocksPerDim);
+   }
+
+   // The selector and the SoA are held by reference and must outlive this sweep.
+   shared_ptr< StructuredBlockStorage > blockStorage_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+   const Vector3< uint_t > subBlockSize_;
+};
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsGPU.h b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsGPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..7576d9476764d092a5c8c3defa4a85117f34b2d5
--- /dev/null
+++ b/src/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMappingSweepsGPU.h
@@ -0,0 +1,362 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMappingSweepsGPU.h
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "domain_decomposition/StructuredBlockStorage.h"
+
+#include "field/GhostLayerField.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/FieldIndexing.h"
+#include "gpu/GPUField.h"
+#include "gpu/HostFieldAllocator.h"
+#include "gpu/Kernel.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/mapping/ParticleBoundingBox.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+
+#include "mesa_pd/common/AABBConversion.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/kernel/SingleCast.h"
+
+#include <functional>
+#include <mesa_pd/data/ParticleStorage.h>
+#include <mesa_pd/data/shape/Sphere.h>
+
+#include "ParticleAndVolumeFractionMappingKernels.h"
+
+namespace walberla
+{
+namespace lbm_mesapd_coupling
+{
+namespace psm
+{
+namespace gpu
+{
+
+// Configures and launches the linearApproximation< Weighting_T > GPU kernel on the fields of one block.
+template< int Weighting_T >
+void mapParticles(const IBlock& blockIt,
+                  const ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                  const real_t* const spherePositions, const real_t* const sphereRadii, const real_t* const f_rs,
+                  const size_t* const numParticlesSubBlocks, const size_t* const particleIDsSubBlocks,
+                  const Vector3< uint_t > subBlocksPerDim)
+{
+   const auto& soa = particleAndVolumeFractionSoA;
+   auto overlapCountField = blockIt.getData< nOverlappingParticlesFieldGPU_T >(soa.nOverlappingParticlesFieldID);
+   auto bsField           = blockIt.getData< BsFieldGPU_T >(soa.BsFieldID);
+   auto indexField        = blockIt.getData< idxFieldGPU_T >(soa.idxFieldID);
+   auto bField            = blockIt.getData< BFieldGPU_T >(soa.BFieldID);
+   auto kernel            = walberla::gpu::make_kernel(&(linearApproximation< Weighting_T >) );
+   kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< uint_t >::xyz(*overlapCountField));
+   kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*bsField));
+   kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< id_t >::xyz(*indexField));
+   kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*bField));
+   kernel.addParam(soa.omega_);
+   kernel.addParam(spherePositions);
+   kernel.addParam(sphereRadii);
+   kernel.addParam(f_rs);
+   const auto& blockAABB = blockIt.getAABB();
+   kernel.addParam(double3{ blockAABB.xMin(), blockAABB.yMin(), blockAABB.zMin() }); // minimum corner of the block
+   kernel.addParam(blockAABB.xSize() / real_t(overlapCountField->xSize())); // block extent per field cell in x
+   kernel.addParam(numParticlesSubBlocks);
+   kernel.addParam(particleIDsSubBlocks);
+   kernel.addParam(uint3{ uint(subBlocksPerDim[0]), uint(subBlocksPerDim[1]), uint(subBlocksPerDim[2]) });
+   kernel();
+}
+
+// GPU variant of the sphere mapping sweep: per call the particle data is assembled on the host, copied to device
+// memory and passed to mapParticles(). The device copy of the positions is kept in the SoA (it is released at the
+// start of the next call that maps at least one particle); all other device buffers are freed before operator()
+// returns.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class SphereFractionMappingSweep
+{
+ public:
+   // Aborts if the extent of any block is not divisible by subBlockSize in every dimension.
+   SphereFractionMappingSweep(const shared_ptr< StructuredBlockStorage >& blockStorage,
+                              const shared_ptr< ParticleAccessor_T >& ac,
+                              const ParticleSelector_T& mappingParticleSelector,
+                              ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                              const Vector3< uint_t > subBlockSize)
+      : blockStorage_(blockStorage), ac_(ac), mappingParticleSelector_(mappingParticleSelector),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA), subBlockSize_(subBlockSize)
+   {
+      static_assert(std::is_base_of< mesa_pd::data::IAccessor, ParticleAccessor_T >::value,
+                    "Provide a valid accessor as template");
+      for (auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt)
+      {
+         auto aabb = blockIt->getAABB();
+         if (size_t(aabb.xSize()) % subBlockSize_[0] != 0 || size_t(aabb.ySize()) % subBlockSize_[1] != 0 ||
+             size_t(aabb.zSize()) % subBlockSize_[2] != 0)
+         {
+            WALBERLA_ABORT("Number of cells per block (" << aabb << ") is not divisible by subBlockSize ("
+                                                         << subBlockSize_ << ").")
+         }
+      }
+   }
+
+   // Rebuilds the device-side particle arrays for this block and calls mapParticles(). When no particle is
+   // selected, only the stored UIDs are cleared and the function returns before anything is allocated.
+   void operator()(IBlock* block)
+   {
+      size_t numMappedParticles = 0;
+      particleAndVolumeFractionSoA_.mappingUIDs.clear();
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            numMappedParticles++;
+            // Store UIDs to make sure that the particles have not changed between the mapping and the PSM sweep
+            particleAndVolumeFractionSoA_.mappingUIDs.push_back(ac_->getUid(idx));
+         }
+      }
+
+      if (numMappedParticles == uint_t(0)) return;
+
+      // Host staging buffers are std::vectors so that they are released on every exit path. The previous device
+      // copy of the positions is freed here and the pointer is reset so that the SoA never holds a dangling pointer.
+      const size_t scalarArraySize = numMappedParticles * sizeof(real_t);
+
+      if (particleAndVolumeFractionSoA_.positions != nullptr)
+      {
+         WALBERLA_GPU_CHECK(gpuFree(particleAndVolumeFractionSoA_.positions));
+         particleAndVolumeFractionSoA_.positions = nullptr;
+      }
+      std::vector< real_t > positions_h(3 * numMappedParticles);
+      std::vector< real_t > radii_h(numMappedParticles);
+      std::vector< real_t > f_r_h(numMappedParticles); // f_r is described in https://doi.org/10.1108/EC-02-2016-0052
+
+      // Store particle information inside the memory
+      size_t idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            for (size_t d = 0; d < 3; ++d)
+            {
+               positions_h[idxMapped * 3 + d] = ac_->getPosition(idx)[d];
+            }
+            // If other shapes than spheres are mapped, ignore them here
+            if (ac_->getShape(idx)->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE)
+            {
+               const real_t radius = static_cast< mesa_pd::data::Sphere* >(ac_->getShape(idx))->getRadius();
+               radii_h[idxMapped]  = radius;
+               // NOTE: sqrt(radius * radius - 0.5) is only real for radius * radius >= 0.5
+               real_t Va           = real_t(
+                  (1.0 / 12.0 - radius * radius) * atan((0.5 * sqrt(radius * radius - 0.5)) / (0.5 - radius * radius)) +
+                  1.0 / 3.0 * sqrt(radius * radius - 0.5) +
+                  (radius * radius - 1.0 / 12.0) * atan(0.5 / sqrt(radius * radius - 0.5)) -
+                  4.0 / 3.0 * radius * radius * radius * atan(0.25 / (radius * sqrt(radius * radius - 0.5))));
+               f_r_h[idxMapped] = Va - radius + real_t(0.5);
+            }
+            idxMapped++;
+         }
+      }
+
+      // Update fraction mapping
+      // Split the block into sub-blocks and sort the particle indices into each overlapping sub-block. This way, in
+      // the particle mapping, each gpu thread only has to check the potentially overlapping particles.
+      auto blockAABB = block->getAABB();
+      const Vector3< uint_t > subBlocksPerDim =
+         Vector3< uint_t >(uint_t(blockAABB.xSize()) / subBlockSize_[0], uint_t(blockAABB.ySize()) / subBlockSize_[1],
+                           uint_t(blockAABB.zSize()) / subBlockSize_[2]);
+      const size_t numSubBlocks = subBlocksPerDim[0] * subBlocksPerDim[1] * subBlocksPerDim[2];
+      std::vector< std::vector< size_t > > subBlocks(numSubBlocks);
+
+      // idxMapped counts every selected particle (not only spheres) so that it matches the indexing used above.
+      idxMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (mappingParticleSelector_(idx, *ac_))
+         {
+            if (ac_->getShape(idx)->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE)
+            {
+               auto sphereAABB = mesa_pd::getParticleAABB(idx, *ac_);
+               if (blockAABB.intersects(sphereAABB))
+               {
+                  auto intersectionAABB = blockAABB.getIntersection(sphereAABB);
+                  intersectionAABB.translate(-blockAABB.minCorner());
+                  mesa_pd::Vec3 blockScaling = mesa_pd::Vec3(real_t(subBlocksPerDim[0]) / blockAABB.sizes()[0],
+                                                             real_t(subBlocksPerDim[1]) / blockAABB.sizes()[1],
+                                                             real_t(subBlocksPerDim[2]) / blockAABB.sizes()[2]);
+
+                  for (size_t z = size_t(intersectionAABB.zMin() * blockScaling[2]);
+                       z < size_t(ceil(intersectionAABB.zMax() * blockScaling[2])); ++z)
+                  {
+                     for (size_t y = size_t(intersectionAABB.yMin() * blockScaling[1]);
+                          y < size_t(ceil(intersectionAABB.yMax() * blockScaling[1])); ++y)
+                     {
+                        for (size_t x = size_t(intersectionAABB.xMin() * blockScaling[0]);
+                             x < size_t(ceil(intersectionAABB.xMax() * blockScaling[0])); ++x)
+                        {
+                           size_t index = z * subBlocksPerDim[0] * subBlocksPerDim[1] + y * subBlocksPerDim[0] + x;
+                           subBlocks[index].push_back(idxMapped);
+                        }
+                     }
+                  }
+               }
+            }
+            idxMapped++;
+         }
+      }
+
+      size_t maxParticlesPerSubBlock = 0;
+      for (const auto& subBlock : subBlocks)
+      {
+         maxParticlesPerSubBlock = std::max(maxParticlesPerSubBlock, subBlock.size());
+      }
+
+      // Flatten the per-sub-block lists: entry k of sub-block "index" is stored at index + k * numSubBlocks.
+      // The ID vector is empty when no sphere overlaps this block.
+      std::vector< size_t > numParticlesPerSubBlock_h(numSubBlocks);
+      std::vector< size_t > particleIDsSubBlocks_h(numSubBlocks * maxParticlesPerSubBlock);
+      for (size_t index = 0; index < numSubBlocks; ++index)
+      {
+         numParticlesPerSubBlock_h[index] = subBlocks[index].size();
+         for (size_t k = 0; k < subBlocks[index].size(); k++)
+         {
+            particleIDsSubBlocks_h[index + k * numSubBlocks] = subBlocks[index][k];
+         }
+      }
+
+      // Copy the host staging buffers to the device. The positions go into the SoA and stay allocated after
+      // this call returns.
+      WALBERLA_GPU_CHECK(gpuMalloc(&(particleAndVolumeFractionSoA_.positions), 3 * scalarArraySize));
+      WALBERLA_GPU_CHECK(gpuMemcpy(particleAndVolumeFractionSoA_.positions, positions_h.data(), 3 * scalarArraySize,
+                                   gpuMemcpyHostToDevice));
+
+      real_t* radii = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&radii, scalarArraySize));
+      WALBERLA_GPU_CHECK(gpuMemcpy(radii, radii_h.data(), scalarArraySize, gpuMemcpyHostToDevice));
+
+      real_t* f_r = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&f_r, scalarArraySize));
+      WALBERLA_GPU_CHECK(gpuMemcpy(f_r, f_r_h.data(), scalarArraySize, gpuMemcpyHostToDevice));
+
+      size_t* numParticlesPerSubBlock = nullptr;
+      WALBERLA_GPU_CHECK(gpuMalloc(&numParticlesPerSubBlock, numSubBlocks * sizeof(size_t)));
+      WALBERLA_GPU_CHECK(gpuMemcpy(numParticlesPerSubBlock, numParticlesPerSubBlock_h.data(),
+                                   numSubBlocks * sizeof(size_t), gpuMemcpyHostToDevice));
+
+      // Initialized to nullptr because the pointer is passed to mapParticles() even when nothing is allocated,
+      // i.e. when no sphere overlaps this block and every per-sub-block count is zero.
+      size_t* particleIDsSubBlocks = nullptr;
+      if (maxParticlesPerSubBlock > uint_t(0))
+      {
+         WALBERLA_GPU_CHECK(gpuMalloc(&particleIDsSubBlocks, numSubBlocks * maxParticlesPerSubBlock * sizeof(size_t)));
+         WALBERLA_GPU_CHECK(gpuMemcpy(particleIDsSubBlocks, particleIDsSubBlocks_h.data(),
+                                      numSubBlocks * maxParticlesPerSubBlock * sizeof(size_t), gpuMemcpyHostToDevice));
+      }
+
+      mapParticles(*block, particleAndVolumeFractionSoA_, particleAndVolumeFractionSoA_.positions, radii, f_r,
+                   numParticlesPerSubBlock, particleIDsSubBlocks, subBlocksPerDim);
+
+      // The host staging vectors are released automatically; only the temporary device buffers are freed here
+      // (the device positions stay in the SoA).
+      WALBERLA_GPU_CHECK(gpuFree(numParticlesPerSubBlock));
+      if (maxParticlesPerSubBlock > uint_t(0))
+      {
+         WALBERLA_GPU_CHECK(gpuFree(particleIDsSubBlocks));
+      }
+      WALBERLA_GPU_CHECK(gpuFree(radii));
+      WALBERLA_GPU_CHECK(gpuFree(f_r));
+   }
+
+   // The selector and the SoA are held by reference and must outlive this sweep.
+   shared_ptr< StructuredBlockStorage > blockStorage_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const ParticleSelector_T& mappingParticleSelector_;
+   ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+   const Vector3< uint_t > subBlockSize_;
+};
+
+// Sweep that launches the boxMapping< Weighting_T > GPU kernel for the particle with UID boxUid on one block.
+template< typename ParticleAccessor_T, typename ParticleSelector_T, int Weighting_T >
+class BoxFractionMappingSweep
+{
+ public:
+   // The box passed to the kernel spans the particle position -/+ boxEdgeLength / 2 in every dimension.
+   BoxFractionMappingSweep(const shared_ptr< StructuredBlockStorage >& blockStorage,
+                           const shared_ptr< ParticleAccessor_T >& ac, const uint_t boxUid,
+                           const Vector3< real_t > boxEdgeLength,
+                           ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA,
+                           const ParticleSelector_T& mappingParticleSelector)
+      : blockStorage_(blockStorage), ac_(ac), boxUid_(boxUid), boxEdgeLength_(boxEdgeLength),
+        particleAndVolumeFractionSoA_(particleAndVolumeFractionSoA), mappingParticleSelector_(mappingParticleSelector)
+   {
+      static_assert(std::is_base_of< mesa_pd::data::IAccessor, ParticleAccessor_T >::value,
+                    "Provide a valid accessor as template");
+   }
+
+   void operator()(IBlock* block)
+   {
+      const auto& soa = particleAndVolumeFractionSoA_;
+      auto overlapCountField = block->getData< nOverlappingParticlesFieldGPU_T >(soa.nOverlappingParticlesFieldID);
+      auto bsField           = block->getData< BsFieldGPU_T >(soa.BsFieldID);
+      auto indexField        = block->getData< idxFieldGPU_T >(soa.idxFieldID);
+      auto bField            = block->getData< BFieldGPU_T >(soa.BFieldID);
+
+      auto kernel = walberla::gpu::make_kernel(&(boxMapping< Weighting_T >) );
+      kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< uint_t >::xyz(*overlapCountField));
+      kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*bsField));
+      kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< id_t >::xyz(*indexField));
+      kernel.addFieldIndexingParam(walberla::gpu::FieldIndexing< real_t >::xyz(*bField));
+      kernel.addParam(soa.omega_);
+      // Lower and upper corner of the box, derived from the current particle position and the edge lengths
+      const Vector3< real_t > center = ac_->getPosition(ac_->uidToIdx(boxUid_));
+      const real_t halfX             = boxEdgeLength_[0] / real_t(2);
+      const real_t halfY             = boxEdgeLength_[1] / real_t(2);
+      const real_t halfZ             = boxEdgeLength_[2] / real_t(2);
+      kernel.addParam(double3{ center[0] - halfX, center[1] - halfY, center[2] - halfZ });
+      kernel.addParam(double3{ center[0] + halfX, center[1] + halfY, center[2] + halfZ });
+      const auto& blockAABB = block->getAABB();
+      kernel.addParam(double3{ blockAABB.xMin(), blockAABB.yMin(), blockAABB.zMin() });
+      kernel.addParam(blockAABB.xSize() / real_t(overlapCountField->xSize()));
+
+      // Position of the box within the sequence of selected particles (selected particles preceding it are counted)
+      size_t boxIdxAmongMapped = 0;
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         if (!mappingParticleSelector_(idx, *ac_)) { continue; }
+         if (ac_->getUid(idx) == boxUid_) { break; }
+         ++boxIdxAmongMapped;
+      }
+      kernel.addParam(boxIdxAmongMapped);
+      kernel();
+   }
+
+   shared_ptr< StructuredBlockStorage > blockStorage_;
+   const shared_ptr< ParticleAccessor_T > ac_;
+   const uint_t boxUid_;
+   const Vector3< real_t > boxEdgeLength_;
+   ParticleAndVolumeFractionSoA_T< Weighting_T >& particleAndVolumeFractionSoA_;
+   const ParticleSelector_T& mappingParticleSelector_;
+};
+
+} // namespace gpu
+} // namespace psm
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/mesa_pd/common/ParticleFunctions.h b/src/mesa_pd/common/ParticleFunctions.h
index 155ca3af1a8b2fd9627f5005f6def49c8b98092d..14397c39c6c3e8ff89a041bd01bcd556b2374215 100644
--- a/src/mesa_pd/common/ParticleFunctions.h
+++ b/src/mesa_pd/common/ParticleFunctions.h
@@ -88,23 +88,23 @@ inline Mat3 getInertia(const size_t p_idx, Accessor& ac)
 template <typename Accessor>
 inline void addForceAtomic(const size_t p_idx, Accessor& ac, const Vec3& f)
 {
-   // Increasing the force and torque on this particle;
-   ac.getForceRef(p_idx)[0]  += f[0];;
-   ac.getForceRef(p_idx)[1]  += f[1];;
+   // Increasing the force on this particle by f, component by component (the torque is not modified)
+   ac.getForceRef(p_idx)[0]  += f[0];
+   ac.getForceRef(p_idx)[1]  += f[1];
    ac.getForceRef(p_idx)[2]  += f[2];
 }
 
 template <typename Accessor>
 inline void addForceAtWFPosAtomic(const size_t p_idx, Accessor& ac, const Vec3& f, const Vec3& wf_pt)
 {
-   // Increasing the force and torque on this particle;
-   ac.getForceRef(p_idx)[0]  += f[0];;
-   ac.getForceRef(p_idx)[1]  += f[1];;
+   // Increasing the force on this particle by f and its torque by the moment of f acting at the point wf_pt
+   ac.getForceRef(p_idx)[0]  += f[0];
+   ac.getForceRef(p_idx)[1]  += f[1];
    ac.getForceRef(p_idx)[2]  += f[2];
 
-   const auto t = cross(( wf_pt - ac.getPosition(p_idx) ), f);;
-   ac.getTorqueRef(p_idx)[0] += t[0];;
-   ac.getTorqueRef(p_idx)[1] += t[1];;
+   const auto t = cross(( wf_pt - ac.getPosition(p_idx) ), f); // (wf_pt - particle position) x f
+   ac.getTorqueRef(p_idx)[0] += t[0];
+   ac.getTorqueRef(p_idx)[1] += t[1];
    ac.getTorqueRef(p_idx)[2] += t[2];
 }
 
@@ -114,9 +114,9 @@ inline void addForceAtWFPosAtomic(const size_t p_idx, Accessor& ac, const Vec3&
 template <typename Accessor>
 inline void addTorqueAtomic(const size_t p_idx, Accessor& ac, const Vec3& t)
 {
-   // Increasing the torque on this particle;
-   ac.getTorqueRef(p_idx)[0]  += t[0];;
-   ac.getTorqueRef(p_idx)[1]  += t[1];;
+   // Increasing the torque on this particle by t, component by component (the force is not modified)
+   ac.getTorqueRef(p_idx)[0]  += t[0];
+   ac.getTorqueRef(p_idx)[1]  += t[1];
    ac.getTorqueRef(p_idx)[2]  += t[2];
 }
 
diff --git a/src/mesa_pd/data/ParticleAccessor.h b/src/mesa_pd/data/ParticleAccessor.h
index f18c0105a00ea1b45a2a97c4f0c2eb2803b74a93..ea82d84f93fc49a4400720aebfcde5bbcac93b8b 100644
--- a/src/mesa_pd/data/ParticleAccessor.h
+++ b/src/mesa_pd/data/ParticleAccessor.h
@@ -88,6 +88,10 @@ public:
    walberla::mesa_pd::Vec3& getOldForceRef(const size_t p_idx) {return ps_->getOldForceRef(p_idx);}
    void setOldForce(const size_t p_idx, walberla::mesa_pd::Vec3 const & v) { ps_->setOldForce(p_idx, v);}
    
+   walberla::real_t const & getCharge(const size_t p_idx) const {return ps_->getCharge(p_idx);}
+   walberla::real_t& getChargeRef(const size_t p_idx) {return ps_->getChargeRef(p_idx);}
+   void setCharge(const size_t p_idx, walberla::real_t const & v) { ps_->setCharge(p_idx, v);}
+   
    size_t const & getShapeID(const size_t p_idx) const {return ps_->getShapeID(p_idx);}
    size_t& getShapeIDRef(const size_t p_idx) {return ps_->getShapeIDRef(p_idx);}
    void setShapeID(const size_t p_idx, size_t const & v) { ps_->setShapeID(p_idx, v);}
@@ -172,6 +176,18 @@ public:
    walberla::mesa_pd::Vec3& getOldHydrodynamicTorqueRef(const size_t p_idx) {return ps_->getOldHydrodynamicTorqueRef(p_idx);}
    void setOldHydrodynamicTorque(const size_t p_idx, walberla::mesa_pd::Vec3 const & v) { ps_->setOldHydrodynamicTorque(p_idx, v);}
    
+   walberla::mesa_pd::Vec3 const & getElectrostaticForce(const size_t p_idx) const {return ps_->getElectrostaticForce(p_idx);}
+   walberla::mesa_pd::Vec3& getElectrostaticForceRef(const size_t p_idx) {return ps_->getElectrostaticForceRef(p_idx);}
+   void setElectrostaticForce(const size_t p_idx, walberla::mesa_pd::Vec3 const & v) { ps_->setElectrostaticForce(p_idx, v);}
+   
+   walberla::real_t const & getTotalDisplacement(const size_t p_idx) const {return ps_->getTotalDisplacement(p_idx);}
+   walberla::real_t& getTotalDisplacementRef(const size_t p_idx) {return ps_->getTotalDisplacementRef(p_idx);}
+   void setTotalDisplacement(const size_t p_idx, walberla::real_t const & v) { ps_->setTotalDisplacement(p_idx, v);}
+   
+   walberla::real_t const & getCollisionForceNorm(const size_t p_idx) const {return ps_->getCollisionForceNorm(p_idx);}
+   walberla::real_t& getCollisionForceNormRef(const size_t p_idx) {return ps_->getCollisionForceNormRef(p_idx);}
+   void setCollisionForceNorm(const size_t p_idx, walberla::real_t const & v) { ps_->setCollisionForceNorm(p_idx, v);}
+   
    walberla::real_t const & getVirtualMass(const size_t p_idx) const {return ps_->getVirtualMass(p_idx);}
    walberla::real_t& getVirtualMassRef(const size_t p_idx) {return ps_->getVirtualMassRef(p_idx);}
    void setVirtualMass(const size_t p_idx, walberla::real_t const & v) { ps_->setVirtualMass(p_idx, v);}
@@ -297,6 +313,10 @@ public:
    void setOldForce(const size_t /*p_idx*/, walberla::mesa_pd::Vec3 const & v) { oldForce_ = v;}
    walberla::mesa_pd::Vec3& getOldForceRef(const size_t /*p_idx*/) {return oldForce_;}
    
+   walberla::real_t const & getCharge(const size_t /*p_idx*/) const {return charge_;}
+   void setCharge(const size_t /*p_idx*/, walberla::real_t const & v) { charge_ = v;}
+   walberla::real_t& getChargeRef(const size_t /*p_idx*/) {return charge_;}
+   
    size_t const & getShapeID(const size_t /*p_idx*/) const {return shapeID_;}
    void setShapeID(const size_t /*p_idx*/, size_t const & v) { shapeID_ = v;}
    size_t& getShapeIDRef(const size_t /*p_idx*/) {return shapeID_;}
@@ -381,6 +401,18 @@ public:
    void setOldHydrodynamicTorque(const size_t /*p_idx*/, walberla::mesa_pd::Vec3 const & v) { oldHydrodynamicTorque_ = v;}
    walberla::mesa_pd::Vec3& getOldHydrodynamicTorqueRef(const size_t /*p_idx*/) {return oldHydrodynamicTorque_;}
    
+   walberla::mesa_pd::Vec3 const & getElectrostaticForce(const size_t /*p_idx*/) const {return electrostaticForce_;}
+   void setElectrostaticForce(const size_t /*p_idx*/, walberla::mesa_pd::Vec3 const & v) { electrostaticForce_ = v;}
+   walberla::mesa_pd::Vec3& getElectrostaticForceRef(const size_t /*p_idx*/) {return electrostaticForce_;}
+   
+   walberla::real_t const & getTotalDisplacement(const size_t /*p_idx*/) const {return totalDisplacement_;}
+   void setTotalDisplacement(const size_t /*p_idx*/, walberla::real_t const & v) { totalDisplacement_ = v;}
+   walberla::real_t& getTotalDisplacementRef(const size_t /*p_idx*/) {return totalDisplacement_;}
+   
+   walberla::real_t const & getCollisionForceNorm(const size_t /*p_idx*/) const {return collisionForceNorm_;}
+   void setCollisionForceNorm(const size_t /*p_idx*/, walberla::real_t const & v) { collisionForceNorm_ = v;}
+   walberla::real_t& getCollisionForceNormRef(const size_t /*p_idx*/) {return collisionForceNorm_;}
+   
    walberla::real_t const & getVirtualMass(const size_t /*p_idx*/) const {return virtualMass_;}
    void setVirtualMass(const size_t /*p_idx*/, walberla::real_t const & v) { virtualMass_ = v;}
    walberla::real_t& getVirtualMassRef(const size_t /*p_idx*/) {return virtualMass_;}
@@ -442,6 +474,7 @@ private:
    walberla::real_t invMass_;
    walberla::mesa_pd::Vec3 force_;
    walberla::mesa_pd::Vec3 oldForce_;
+   walberla::real_t charge_;
    size_t shapeID_;
    std::shared_ptr<walberla::mesa_pd::data::BaseShape> baseShape_;
    walberla::mesa_pd::Rot3 rotation_;
@@ -463,6 +496,9 @@ private:
    walberla::mesa_pd::Vec3 hydrodynamicTorque_;
    walberla::mesa_pd::Vec3 oldHydrodynamicForce_;
    walberla::mesa_pd::Vec3 oldHydrodynamicTorque_;
+   walberla::mesa_pd::Vec3 electrostaticForce_;
+   walberla::real_t totalDisplacement_;
+   walberla::real_t collisionForceNorm_;
    walberla::real_t virtualMass_;
    walberla::real_t invMassIncludingVirtual_;
    walberla::mesa_pd::Vec3 oldLinearAcceleration_;
diff --git a/src/mesa_pd/data/ParticleStorage.h b/src/mesa_pd/data/ParticleStorage.h
index a25c9d29e0ccf2c5e4eb53e554a004116b7428c0..ef37bbe0303d22f5069dc54558d4c3c7acd2d37c 100644
--- a/src/mesa_pd/data/ParticleStorage.h
+++ b/src/mesa_pd/data/ParticleStorage.h
@@ -82,6 +82,7 @@ public:
       using invMass_type = walberla::real_t;
       using force_type = walberla::mesa_pd::Vec3;
       using oldForce_type = walberla::mesa_pd::Vec3;
+      using charge_type = walberla::real_t;
       using shapeID_type = size_t;
       using baseShape_type = std::shared_ptr<walberla::mesa_pd::data::BaseShape>;
       using rotation_type = walberla::mesa_pd::Rot3;
@@ -103,6 +104,9 @@ public:
       using hydrodynamicTorque_type = walberla::mesa_pd::Vec3;
       using oldHydrodynamicForce_type = walberla::mesa_pd::Vec3;
       using oldHydrodynamicTorque_type = walberla::mesa_pd::Vec3;
+      using electrostaticForce_type = walberla::mesa_pd::Vec3;
+      using totalDisplacement_type = walberla::real_t;
+      using collisionForceNorm_type = walberla::real_t;
       using virtualMass_type = walberla::real_t;
       using invMassIncludingVirtual_type = walberla::real_t;
       using oldLinearAcceleration_type = walberla::mesa_pd::Vec3;
@@ -155,6 +159,10 @@ public:
       oldForce_type& getOldForceRef() {return storage_.getOldForceRef(i_);}
       void setOldForce(oldForce_type const & v) { storage_.setOldForce(i_, v);}
       
+      charge_type const & getCharge() const {return storage_.getCharge(i_);}
+      charge_type& getChargeRef() {return storage_.getChargeRef(i_);}
+      void setCharge(charge_type const & v) { storage_.setCharge(i_, v);}
+      
       shapeID_type const & getShapeID() const {return storage_.getShapeID(i_);}
       shapeID_type& getShapeIDRef() {return storage_.getShapeIDRef(i_);}
       void setShapeID(shapeID_type const & v) { storage_.setShapeID(i_, v);}
@@ -239,6 +247,18 @@ public:
       oldHydrodynamicTorque_type& getOldHydrodynamicTorqueRef() {return storage_.getOldHydrodynamicTorqueRef(i_);}
       void setOldHydrodynamicTorque(oldHydrodynamicTorque_type const & v) { storage_.setOldHydrodynamicTorque(i_, v);}
       
+      electrostaticForce_type const & getElectrostaticForce() const {return storage_.getElectrostaticForce(i_);}
+      electrostaticForce_type& getElectrostaticForceRef() {return storage_.getElectrostaticForceRef(i_);}
+      void setElectrostaticForce(electrostaticForce_type const & v) { storage_.setElectrostaticForce(i_, v);}
+      
+      totalDisplacement_type const & getTotalDisplacement() const {return storage_.getTotalDisplacement(i_);}
+      totalDisplacement_type& getTotalDisplacementRef() {return storage_.getTotalDisplacementRef(i_);}
+      void setTotalDisplacement(totalDisplacement_type const & v) { storage_.setTotalDisplacement(i_, v);}
+      
+      collisionForceNorm_type const & getCollisionForceNorm() const {return storage_.getCollisionForceNorm(i_);}
+      collisionForceNorm_type& getCollisionForceNormRef() {return storage_.getCollisionForceNormRef(i_);}
+      void setCollisionForceNorm(collisionForceNorm_type const & v) { storage_.setCollisionForceNorm(i_, v);}
+      
       virtualMass_type const & getVirtualMass() const {return storage_.getVirtualMass(i_);}
       virtualMass_type& getVirtualMassRef() {return storage_.getVirtualMassRef(i_);}
       void setVirtualMass(virtualMass_type const & v) { storage_.setVirtualMass(i_, v);}
@@ -349,6 +369,7 @@ public:
    using invMass_type = walberla::real_t;
    using force_type = walberla::mesa_pd::Vec3;
    using oldForce_type = walberla::mesa_pd::Vec3;
+   using charge_type = walberla::real_t;
    using shapeID_type = size_t;
    using baseShape_type = std::shared_ptr<walberla::mesa_pd::data::BaseShape>;
    using rotation_type = walberla::mesa_pd::Rot3;
@@ -370,6 +391,9 @@ public:
    using hydrodynamicTorque_type = walberla::mesa_pd::Vec3;
    using oldHydrodynamicForce_type = walberla::mesa_pd::Vec3;
    using oldHydrodynamicTorque_type = walberla::mesa_pd::Vec3;
+   using electrostaticForce_type = walberla::mesa_pd::Vec3;
+   using totalDisplacement_type = walberla::real_t;
+   using collisionForceNorm_type = walberla::real_t;
    using virtualMass_type = walberla::real_t;
    using invMassIncludingVirtual_type = walberla::real_t;
    using oldLinearAcceleration_type = walberla::mesa_pd::Vec3;
@@ -422,6 +446,10 @@ public:
    oldForce_type& getOldForceRef(const size_t idx) {return oldForce_[idx];}
    void setOldForce(const size_t idx, oldForce_type const & v) { oldForce_[idx] = v; }
    
+   charge_type const & getCharge(const size_t idx) const {return charge_[idx];}
+   charge_type& getChargeRef(const size_t idx) {return charge_[idx];}
+   void setCharge(const size_t idx, charge_type const & v) { charge_[idx] = v; }
+   
    shapeID_type const & getShapeID(const size_t idx) const {return shapeID_[idx];}
    shapeID_type& getShapeIDRef(const size_t idx) {return shapeID_[idx];}
    void setShapeID(const size_t idx, shapeID_type const & v) { shapeID_[idx] = v; }
@@ -506,6 +534,18 @@ public:
    oldHydrodynamicTorque_type& getOldHydrodynamicTorqueRef(const size_t idx) {return oldHydrodynamicTorque_[idx];}
    void setOldHydrodynamicTorque(const size_t idx, oldHydrodynamicTorque_type const & v) { oldHydrodynamicTorque_[idx] = v; }
    
+   electrostaticForce_type const & getElectrostaticForce(const size_t idx) const {return electrostaticForce_[idx];}
+   electrostaticForce_type& getElectrostaticForceRef(const size_t idx) {return electrostaticForce_[idx];}
+   void setElectrostaticForce(const size_t idx, electrostaticForce_type const & v) { electrostaticForce_[idx] = v; }
+   
+   totalDisplacement_type const & getTotalDisplacement(const size_t idx) const {return totalDisplacement_[idx];}
+   totalDisplacement_type& getTotalDisplacementRef(const size_t idx) {return totalDisplacement_[idx];}
+   void setTotalDisplacement(const size_t idx, totalDisplacement_type const & v) { totalDisplacement_[idx] = v; }
+   
+   collisionForceNorm_type const & getCollisionForceNorm(const size_t idx) const {return collisionForceNorm_[idx];}
+   collisionForceNorm_type& getCollisionForceNormRef(const size_t idx) {return collisionForceNorm_[idx];}
+   void setCollisionForceNorm(const size_t idx, collisionForceNorm_type const & v) { collisionForceNorm_[idx] = v; }
+   
    virtualMass_type const & getVirtualMass(const size_t idx) const {return virtualMass_[idx];}
    virtualMass_type& getVirtualMassRef(const size_t idx) {return virtualMass_[idx];}
    void setVirtualMass(const size_t idx, virtualMass_type const & v) { virtualMass_[idx] = v; }
@@ -647,6 +687,7 @@ public:
    std::vector<invMass_type> invMass_ {};
    std::vector<force_type> force_ {};
    std::vector<oldForce_type> oldForce_ {};
+   std::vector<charge_type> charge_ {};
    std::vector<shapeID_type> shapeID_ {};
    std::vector<baseShape_type> baseShape_ {};
    std::vector<rotation_type> rotation_ {};
@@ -668,6 +709,9 @@ public:
    std::vector<hydrodynamicTorque_type> hydrodynamicTorque_ {};
    std::vector<oldHydrodynamicForce_type> oldHydrodynamicForce_ {};
    std::vector<oldHydrodynamicTorque_type> oldHydrodynamicTorque_ {};
+   std::vector<electrostaticForce_type> electrostaticForce_ {};
+   std::vector<totalDisplacement_type> totalDisplacement_ {};
+   std::vector<collisionForceNorm_type> collisionForceNorm_ {};
    std::vector<virtualMass_type> virtualMass_ {};
    std::vector<invMassIncludingVirtual_type> invMassIncludingVirtual_ {};
    std::vector<oldLinearAcceleration_type> oldLinearAcceleration_ {};
@@ -697,6 +741,7 @@ ParticleStorage::Particle& ParticleStorage::Particle::operator=(const ParticleSt
    getInvMassRef() = rhs.getInvMass();
    getForceRef() = rhs.getForce();
    getOldForceRef() = rhs.getOldForce();
+   getChargeRef() = rhs.getCharge();
    getShapeIDRef() = rhs.getShapeID();
    getBaseShapeRef() = rhs.getBaseShape();
    getRotationRef() = rhs.getRotation();
@@ -718,6 +763,9 @@ ParticleStorage::Particle& ParticleStorage::Particle::operator=(const ParticleSt
    getHydrodynamicTorqueRef() = rhs.getHydrodynamicTorque();
    getOldHydrodynamicForceRef() = rhs.getOldHydrodynamicForce();
    getOldHydrodynamicTorqueRef() = rhs.getOldHydrodynamicTorque();
+   getElectrostaticForceRef() = rhs.getElectrostaticForce();
+   getTotalDisplacementRef() = rhs.getTotalDisplacement();
+   getCollisionForceNormRef() = rhs.getCollisionForceNorm();
    getVirtualMassRef() = rhs.getVirtualMass();
    getInvMassIncludingVirtualRef() = rhs.getInvMassIncludingVirtual();
    getOldLinearAccelerationRef() = rhs.getOldLinearAcceleration();
@@ -744,6 +792,7 @@ ParticleStorage::Particle& ParticleStorage::Particle::operator=(ParticleStorage:
    getInvMassRef() = std::move(rhs.getInvMassRef());
    getForceRef() = std::move(rhs.getForceRef());
    getOldForceRef() = std::move(rhs.getOldForceRef());
+   getChargeRef() = std::move(rhs.getChargeRef());
    getShapeIDRef() = std::move(rhs.getShapeIDRef());
    getBaseShapeRef() = std::move(rhs.getBaseShapeRef());
    getRotationRef() = std::move(rhs.getRotationRef());
@@ -765,6 +814,9 @@ ParticleStorage::Particle& ParticleStorage::Particle::operator=(ParticleStorage:
    getHydrodynamicTorqueRef() = std::move(rhs.getHydrodynamicTorqueRef());
    getOldHydrodynamicForceRef() = std::move(rhs.getOldHydrodynamicForceRef());
    getOldHydrodynamicTorqueRef() = std::move(rhs.getOldHydrodynamicTorqueRef());
+   getElectrostaticForceRef() = std::move(rhs.getElectrostaticForceRef());
+   getTotalDisplacementRef() = std::move(rhs.getTotalDisplacementRef());
+   getCollisionForceNormRef() = std::move(rhs.getCollisionForceNormRef());
    getVirtualMassRef() = std::move(rhs.getVirtualMassRef());
    getInvMassIncludingVirtualRef() = std::move(rhs.getInvMassIncludingVirtualRef());
    getOldLinearAccelerationRef() = std::move(rhs.getOldLinearAccelerationRef());
@@ -792,6 +844,7 @@ void swap(ParticleStorage::Particle lhs, ParticleStorage::Particle rhs)
    std::swap(lhs.getInvMassRef(), rhs.getInvMassRef());
    std::swap(lhs.getForceRef(), rhs.getForceRef());
    std::swap(lhs.getOldForceRef(), rhs.getOldForceRef());
+   std::swap(lhs.getChargeRef(), rhs.getChargeRef());
    std::swap(lhs.getShapeIDRef(), rhs.getShapeIDRef());
    std::swap(lhs.getBaseShapeRef(), rhs.getBaseShapeRef());
    std::swap(lhs.getRotationRef(), rhs.getRotationRef());
@@ -813,6 +866,9 @@ void swap(ParticleStorage::Particle lhs, ParticleStorage::Particle rhs)
    std::swap(lhs.getHydrodynamicTorqueRef(), rhs.getHydrodynamicTorqueRef());
    std::swap(lhs.getOldHydrodynamicForceRef(), rhs.getOldHydrodynamicForceRef());
    std::swap(lhs.getOldHydrodynamicTorqueRef(), rhs.getOldHydrodynamicTorqueRef());
+   std::swap(lhs.getElectrostaticForceRef(), rhs.getElectrostaticForceRef());
+   std::swap(lhs.getTotalDisplacementRef(), rhs.getTotalDisplacementRef());
+   std::swap(lhs.getCollisionForceNormRef(), rhs.getCollisionForceNormRef());
    std::swap(lhs.getVirtualMassRef(), rhs.getVirtualMassRef());
    std::swap(lhs.getInvMassIncludingVirtualRef(), rhs.getInvMassIncludingVirtualRef());
    std::swap(lhs.getOldLinearAccelerationRef(), rhs.getOldLinearAccelerationRef());
@@ -840,6 +896,7 @@ std::ostream& operator<<( std::ostream& os, const ParticleStorage::Particle& p )
          "invMass             : " << p.getInvMass() << "\n" <<
          "force               : " << p.getForce() << "\n" <<
          "oldForce            : " << p.getOldForce() << "\n" <<
+         "charge              : " << p.getCharge() << "\n" <<
          "shapeID             : " << p.getShapeID() << "\n" <<
          "baseShape           : " << p.getBaseShape() << "\n" <<
          "rotation            : " << p.getRotation() << "\n" <<
@@ -861,6 +918,9 @@ std::ostream& operator<<( std::ostream& os, const ParticleStorage::Particle& p )
          "hydrodynamicTorque  : " << p.getHydrodynamicTorque() << "\n" <<
          "oldHydrodynamicForce: " << p.getOldHydrodynamicForce() << "\n" <<
          "oldHydrodynamicTorque: " << p.getOldHydrodynamicTorque() << "\n" <<
+         "electrostaticForce  : " << p.getElectrostaticForce() << "\n" <<
+         "totalDisplacement   : " << p.getTotalDisplacement() << "\n" <<
+         "collisionForceNorm  : " << p.getCollisionForceNorm() << "\n" <<
          "virtualMass         : " << p.getVirtualMass() << "\n" <<
          "invMassIncludingVirtual: " << p.getInvMassIncludingVirtual() << "\n" <<
          "oldLinearAcceleration: " << p.getOldLinearAcceleration() << "\n" <<
@@ -958,6 +1018,7 @@ inline ParticleStorage::iterator ParticleStorage::create(const id_t& uid)
    invMass_.emplace_back(real_t(1));
    force_.emplace_back(real_t(0));
    oldForce_.emplace_back(real_t(0));
+   charge_.emplace_back(real_t(0));
    shapeID_.emplace_back();
    baseShape_.emplace_back(make_shared<walberla::mesa_pd::data::BaseShape>());
    rotation_.emplace_back();
@@ -979,6 +1040,9 @@ inline ParticleStorage::iterator ParticleStorage::create(const id_t& uid)
    hydrodynamicTorque_.emplace_back(real_t(0));
    oldHydrodynamicForce_.emplace_back(real_t(0));
    oldHydrodynamicTorque_.emplace_back(real_t(0));
+   electrostaticForce_.emplace_back(real_t(0));
+   totalDisplacement_.emplace_back(real_t(0));
+   collisionForceNorm_.emplace_back(real_t(0));
    virtualMass_.emplace_back(real_t(0));
    invMassIncludingVirtual_.emplace_back(real_t(0));
    oldLinearAcceleration_.emplace_back(real_t(0));
@@ -1031,6 +1095,7 @@ inline ParticleStorage::iterator ParticleStorage::erase(iterator& it)
    invMass_.pop_back();
    force_.pop_back();
    oldForce_.pop_back();
+   charge_.pop_back();
    shapeID_.pop_back();
    baseShape_.pop_back();
    rotation_.pop_back();
@@ -1052,6 +1117,9 @@ inline ParticleStorage::iterator ParticleStorage::erase(iterator& it)
    hydrodynamicTorque_.pop_back();
    oldHydrodynamicForce_.pop_back();
    oldHydrodynamicTorque_.pop_back();
+   electrostaticForce_.pop_back();
+   totalDisplacement_.pop_back();
+   collisionForceNorm_.pop_back();
    virtualMass_.pop_back();
    invMassIncludingVirtual_.pop_back();
    oldLinearAcceleration_.pop_back();
@@ -1091,6 +1159,7 @@ inline void ParticleStorage::reserve(const size_t size)
    invMass_.reserve(size);
    force_.reserve(size);
    oldForce_.reserve(size);
+   charge_.reserve(size);
    shapeID_.reserve(size);
    baseShape_.reserve(size);
    rotation_.reserve(size);
@@ -1112,6 +1181,9 @@ inline void ParticleStorage::reserve(const size_t size)
    hydrodynamicTorque_.reserve(size);
    oldHydrodynamicForce_.reserve(size);
    oldHydrodynamicTorque_.reserve(size);
+   electrostaticForce_.reserve(size);
+   totalDisplacement_.reserve(size);
+   collisionForceNorm_.reserve(size);
    virtualMass_.reserve(size);
    invMassIncludingVirtual_.reserve(size);
    oldLinearAcceleration_.reserve(size);
@@ -1136,6 +1208,7 @@ inline void ParticleStorage::clear()
    invMass_.clear();
    force_.clear();
    oldForce_.clear();
+   charge_.clear();
    shapeID_.clear();
    baseShape_.clear();
    rotation_.clear();
@@ -1157,6 +1230,9 @@ inline void ParticleStorage::clear()
    hydrodynamicTorque_.clear();
    oldHydrodynamicForce_.clear();
    oldHydrodynamicTorque_.clear();
+   electrostaticForce_.clear();
+   totalDisplacement_.clear();
+   collisionForceNorm_.clear();
    virtualMass_.clear();
    invMassIncludingVirtual_.clear();
    oldLinearAcceleration_.clear();
@@ -1182,6 +1258,7 @@ inline size_t ParticleStorage::size() const
    //WALBERLA_ASSERT_EQUAL( uid_.size(), invMass.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), force.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), oldForce.size() );
+   //WALBERLA_ASSERT_EQUAL( uid_.size(), charge.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), shapeID.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), baseShape.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), rotation.size() );
@@ -1203,6 +1280,9 @@ inline size_t ParticleStorage::size() const
    //WALBERLA_ASSERT_EQUAL( uid_.size(), hydrodynamicTorque.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), oldHydrodynamicForce.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), oldHydrodynamicTorque.size() );
+   //WALBERLA_ASSERT_EQUAL( uid_.size(), electrostaticForce.size() );
+   //WALBERLA_ASSERT_EQUAL( uid_.size(), totalDisplacement.size() );
+   //WALBERLA_ASSERT_EQUAL( uid_.size(), collisionForceNorm.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), virtualMass.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), invMassIncludingVirtual.size() );
    //WALBERLA_ASSERT_EQUAL( uid_.size(), oldLinearAcceleration.size() );
@@ -1469,6 +1549,15 @@ public:
    walberla::mesa_pd::Vec3 const & operator()(const data::Particle& p) const {return p.getOldForce();}
 };
 ///Predicate that selects a certain property from a Particle
+class SelectParticleCharge // functor returning the charge property of a Particle
+{
+public:
+   using return_type = walberla::real_t; // value type of the selected property
+   walberla::real_t& operator()(data::Particle& p) const {return p.getChargeRef();} // mutable reference via getChargeRef()
+   walberla::real_t& operator()(data::Particle&& p) const {return p.getChargeRef();} // same, for a temporary Particle object
+   walberla::real_t const & operator()(const data::Particle& p) const {return p.getCharge();} // read-only reference via getCharge()
+};
+///Predicate that selects a certain property from a Particle
 class SelectParticleShapeID
 {
 public:
@@ -1658,6 +1747,33 @@ public:
    walberla::mesa_pd::Vec3 const & operator()(const data::Particle& p) const {return p.getOldHydrodynamicTorque();}
 };
 ///Predicate that selects a certain property from a Particle
+class SelectParticleElectrostaticForce // functor returning the electrostaticForce property of a Particle
+{
+public:
+   using return_type = walberla::mesa_pd::Vec3; // value type of the selected property
+   walberla::mesa_pd::Vec3& operator()(data::Particle& p) const {return p.getElectrostaticForceRef();} // mutable reference via getElectrostaticForceRef()
+   walberla::mesa_pd::Vec3& operator()(data::Particle&& p) const {return p.getElectrostaticForceRef();} // same, for a temporary Particle object
+   walberla::mesa_pd::Vec3 const & operator()(const data::Particle& p) const {return p.getElectrostaticForce();} // read-only reference via getElectrostaticForce()
+};
+///Predicate that selects a certain property from a Particle
+class SelectParticleTotalDisplacement // functor returning the totalDisplacement property of a Particle
+{
+public:
+   using return_type = walberla::real_t; // value type of the selected property
+   walberla::real_t& operator()(data::Particle& p) const {return p.getTotalDisplacementRef();} // mutable reference via getTotalDisplacementRef()
+   walberla::real_t& operator()(data::Particle&& p) const {return p.getTotalDisplacementRef();} // same, for a temporary Particle object
+   walberla::real_t const & operator()(const data::Particle& p) const {return p.getTotalDisplacement();} // read-only reference via getTotalDisplacement()
+};
+///Predicate that selects a certain property from a Particle
+class SelectParticleCollisionForceNorm // functor returning the collisionForceNorm property of a Particle
+{
+public:
+   using return_type = walberla::real_t; // value type of the selected property
+   walberla::real_t& operator()(data::Particle& p) const {return p.getCollisionForceNormRef();} // mutable reference via getCollisionForceNormRef()
+   walberla::real_t& operator()(data::Particle&& p) const {return p.getCollisionForceNormRef();} // same, for a temporary Particle object
+   walberla::real_t const & operator()(const data::Particle& p) const {return p.getCollisionForceNorm();} // read-only reference via getCollisionForceNorm()
+};
+///Predicate that selects a certain property from a Particle
 class SelectParticleVirtualMass
 {
 public:
diff --git a/src/mesa_pd/kernel/LinearSpringDashpot.h b/src/mesa_pd/kernel/LinearSpringDashpot.h
index b4b35082cba39c5884b757a722baa494a5ad6750..c2d41674b023e11f71316897f9b97905bd21f593 100644
--- a/src/mesa_pd/kernel/LinearSpringDashpot.h
+++ b/src/mesa_pd/kernel/LinearSpringDashpot.h
@@ -344,7 +344,8 @@ inline void LinearSpringDashpot::operator()(const size_t p_idx1,
       const real_t fTabs( std::min( fTLS.length(), fFrictionAbs) );
       const Vec3   fT   ( fTabs * t );
 
-      //TODO check if tangential spring displacement is same for symmetric case
+      // TODO check if tangential spring displacement is same for symmetric case
+      // TODO: check why exactly this critical section is needed
       auto& ch1 = ac.getNewContactHistoryRef(p_idx1)[ac.getUid(p_idx2)];
       ch1.setTangentialSpringDisplacement(newTangentialSpringDisplacement);
       ch1.setIsSticking(isSticking);
diff --git a/src/mesa_pd/mpi/notifications/CMakeLists.txt b/src/mesa_pd/mpi/notifications/CMakeLists.txt
index c50238b775c0bc00aa6986c8c471ab237ba296f0..16e1bac3779869d71835a318d0ba9b758bd47a35 100644
--- a/src/mesa_pd/mpi/notifications/CMakeLists.txt
+++ b/src/mesa_pd/mpi/notifications/CMakeLists.txt
@@ -17,5 +17,6 @@ target_sources( mesa_pd
     HydrodynamicForceTorqueNotification.h
     reset.h
     ForceTorqueNotification.h
-    NotificationType.h     
+    NotificationType.h
+    ElectrostaticForceNotification.h
     )
diff --git a/src/mesa_pd/mpi/notifications/ElectrostaticForceNotification.h b/src/mesa_pd/mpi/notifications/ElectrostaticForceNotification.h
new file mode 100644
index 0000000000000000000000000000000000000000..91eb0d71c7f4afbe9978fd14910cab83a06516bc
--- /dev/null
+++ b/src/mesa_pd/mpi/notifications/ElectrostaticForceNotification.h
@@ -0,0 +1,114 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file
+//! \author Sebastian Eibl <sebastian.eibl@fau.de>
+//
+//======================================================================================================================
+
+//======================================================================================================================
+//
+//  THIS FILE IS GENERATED - PLEASE CHANGE THE TEMPLATE !!!
+//
+//======================================================================================================================
+
+#pragma once
+
+#include <mesa_pd/data/DataTypes.h>
+#include <mesa_pd/data/ParticleStorage.h>
+#include <mesa_pd/mpi/notifications/NotificationType.h>
+#include <mesa_pd/mpi/notifications/reset.h>
+
+#include <core/mpi/Datatype.h>
+#include <core/mpi/RecvBuffer.h>
+#include <core/mpi/SendBuffer.h>
+
+namespace walberla {
+namespace mesa_pd {
+
+/**
+ * Transmits the electrostatic force of a particle together with its uid.
+ */
+class ElectrostaticForceNotification // send-side wrapper around a particle; serialized by operator<< below
+{
+public:
+   struct Parameters // receive-side payload; filled by operator>> below
+   {
+      id_t uid_; // uid read from the buffer (written from getUid() of the sent particle)
+      mesa_pd::Vec3 electrostaticForce_; // electrostatic force read from the buffer
+   };
+
+   inline explicit ElectrostaticForceNotification( const data::Particle& p ) : p_(p) {} // stores a reference only, no copy of p is made
+
+   const data::Particle& p_; // referenced particle; must remain valid while this notification is in use
+};
+
+template <>
+inline void reset<ElectrostaticForceNotification>(data::Particle& p) // inline: a full specialization defined in a header is otherwise a duplicate symbol in every including translation unit
+{ // Resets the quantity carried by ElectrostaticForceNotification on particle p.
+   p.setElectrostaticForce( Vec3(real_t(0)) ); // electrostatic force becomes the zero vector
+}
+
+inline void reduce(data::Particle&& p, const ElectrostaticForceNotification::Parameters& objparam) // inline: header-defined free function, avoids duplicate symbols across translation units
+{ // Adds the received electrostatic force in objparam onto the value already stored in p.
+   p.getElectrostaticForceRef() += objparam.electrostaticForce_; // in-place accumulation through the mutable accessor
+}
+
+inline void update(data::Particle&& p, const ElectrostaticForceNotification::Parameters& objparam) // inline: header-defined free function, avoids duplicate symbols across translation units
+{ // Overwrites the electrostatic force stored in p with the received value in objparam.
+   p.setElectrostaticForce( objparam.electrostaticForce_ ); // replace, do not accumulate (contrast with reduce)
+}
+
+}  // namespace mesa_pd
+}  // namespace walberla
+
+//======================================================================================================================
+//
+//  Send/Recv Buffer Serialization Specialization
+//
+//======================================================================================================================
+
+namespace walberla {
+namespace mpi {
+
+template< typename T,    // Element type of SendBuffer
+          typename G>    // Growth policy of SendBuffer
+mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, const mesa_pd::ElectrostaticForceNotification& obj )
+{ // Serializes the notification: debug marker, particle uid, electrostatic force (in this order).
+   buf.addDebugMarker( "pn" ); // must be the same marker string that operator>> passes to readDebugMarker
+   buf << obj.p_.getUid(); // read back into Parameters::uid_
+   buf << obj.p_.getElectrostaticForce(); // read back into Parameters::electrostaticForce_
+   return buf; // returned to allow chaining
+}
+
+template< typename T>    // Element type  of RecvBuffer
+mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd::ElectrostaticForceNotification::Parameters& objparam )
+{ // Deserializes into objparam; fields are read in the same order operator<< wrote them.
+   buf.readDebugMarker( "pn" ); // same marker string as addDebugMarker in operator<<
+   buf >> objparam.uid_; // written from getUid() on the sending side
+   buf >> objparam.electrostaticForce_; // written from getElectrostaticForce() on the sending side
+   return buf; // returned to allow chaining
+}
+
+template< >
+struct BufferSizeTrait< mesa_pd::ElectrostaticForceNotification > { // buffer size description for one serialized notification
+   static const bool constantSize = true; // size below is a compile-time sum, independent of the particle contents
+   static const uint_t size = BufferSizeTrait<id_t>::size + // the uid written by operator<<
+                              BufferSizeTrait<mesa_pd::Vec3>::size + // the electrostatic force vector written by operator<<
+                              mpi::BUFFER_DEBUG_OVERHEAD; // presumably the space taken by the debug marker -- TODO confirm against core/mpi
+};
+
+} // mpi
+} // walberla
\ No newline at end of file
diff --git a/src/mesa_pd/mpi/notifications/ParseMessage.h b/src/mesa_pd/mpi/notifications/ParseMessage.h
index ed3124bf252166d98114bc90d640531ebc002d91..082b02909d7068c79846b0a1a0776c99b4fc68ae 100644
--- a/src/mesa_pd/mpi/notifications/ParseMessage.h
+++ b/src/mesa_pd/mpi/notifications/ParseMessage.h
@@ -115,6 +115,7 @@ void ParseMessage::operator()(int sender,
       pIt->setUid(objparam.uid);
       pIt->setPosition(objparam.position);
       pIt->setLinearVelocity(objparam.linearVelocity);
+      pIt->setCharge(objparam.charge);
       pIt->setRotation(objparam.rotation);
       pIt->setAngularVelocity(objparam.angularVelocity);
       pIt->setRadiusAtTemperature(objparam.radiusAtTemperature);
@@ -153,6 +154,9 @@ void ParseMessage::operator()(int sender,
       pIt->setHydrodynamicTorque(objparam.hydrodynamicTorque_);
       pIt->setOldHydrodynamicForce(objparam.oldHydrodynamicForce_);
       pIt->setOldHydrodynamicTorque(objparam.oldHydrodynamicTorque_);
+      pIt->setElectrostaticForce(objparam.electrostaticForce_);
+      pIt->setTotalDisplacement(objparam.totalDisplacement_);
+      pIt->setCollisionForceNorm(objparam.collisionForceNorm_);
       pIt->setVirtualMass(objparam.virtualMass_);
       pIt->setInvMassIncludingVirtual(objparam.invMassIncludingVirtual_);
       pIt->setOldLinearAcceleration(objparam.oldLinearAcceleration_);
diff --git a/src/mesa_pd/mpi/notifications/ParticleCopyNotification.h b/src/mesa_pd/mpi/notifications/ParticleCopyNotification.h
index 78de9c6eaee587fe1a1fffca9a193dfcba4a6354..23d085b51440a2db1da7ca731530072378816372 100644
--- a/src/mesa_pd/mpi/notifications/ParticleCopyNotification.h
+++ b/src/mesa_pd/mpi/notifications/ParticleCopyNotification.h
@@ -57,6 +57,7 @@ public:
       walberla::mesa_pd::Vec3 linearVelocity {real_t(0)};
       walberla::real_t invMass {real_t(1)};
       walberla::mesa_pd::Vec3 oldForce {real_t(0)};
+      walberla::real_t charge {real_t(0)};
       size_t shapeID {};
       std::shared_ptr<walberla::mesa_pd::data::BaseShape> baseShape {make_shared<walberla::mesa_pd::data::BaseShape>()};
       walberla::mesa_pd::Rot3 rotation {};
@@ -70,6 +71,9 @@ public:
       walberla::mesa_pd::Vec3 hydrodynamicTorque {real_t(0)};
       walberla::mesa_pd::Vec3 oldHydrodynamicForce {real_t(0)};
       walberla::mesa_pd::Vec3 oldHydrodynamicTorque {real_t(0)};
+      walberla::mesa_pd::Vec3 electrostaticForce {real_t(0)};
+      walberla::real_t totalDisplacement {real_t(0)};
+      walberla::real_t collisionForceNorm {real_t(0)};
       walberla::real_t virtualMass {real_t(0)};
       walberla::real_t invMassIncludingVirtual {real_t(0)};
       walberla::mesa_pd::Vec3 oldLinearAcceleration {real_t(0)};
@@ -99,6 +103,7 @@ inline data::ParticleStorage::iterator createNewParticle(data::ParticleStorage&
    pIt->setLinearVelocity(data.linearVelocity);
    pIt->setInvMass(data.invMass);
    pIt->setOldForce(data.oldForce);
+   pIt->setCharge(data.charge);
    pIt->setShapeID(data.shapeID);
    pIt->setBaseShape(data.baseShape);
    pIt->setRotation(data.rotation);
@@ -112,6 +117,9 @@ inline data::ParticleStorage::iterator createNewParticle(data::ParticleStorage&
    pIt->setHydrodynamicTorque(data.hydrodynamicTorque);
    pIt->setOldHydrodynamicForce(data.oldHydrodynamicForce);
    pIt->setOldHydrodynamicTorque(data.oldHydrodynamicTorque);
+   pIt->setElectrostaticForce(data.electrostaticForce);
+   pIt->setTotalDisplacement(data.totalDisplacement);
+   pIt->setCollisionForceNorm(data.collisionForceNorm);
    pIt->setVirtualMass(data.virtualMass);
    pIt->setInvMassIncludingVirtual(data.invMassIncludingVirtual);
    pIt->setOldLinearAcceleration(data.oldLinearAcceleration);
@@ -156,6 +164,7 @@ mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, cons
    buf << obj.particle_.getLinearVelocity();
    buf << obj.particle_.getInvMass();
    buf << obj.particle_.getOldForce();
+   buf << obj.particle_.getCharge();
    buf << obj.particle_.getShapeID();
    buf << obj.particle_.getBaseShape();
    buf << obj.particle_.getRotation();
@@ -169,6 +178,9 @@ mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, cons
    buf << obj.particle_.getHydrodynamicTorque();
    buf << obj.particle_.getOldHydrodynamicForce();
    buf << obj.particle_.getOldHydrodynamicTorque();
+   buf << obj.particle_.getElectrostaticForce();
+   buf << obj.particle_.getTotalDisplacement();
+   buf << obj.particle_.getCollisionForceNorm();
    buf << obj.particle_.getVirtualMass();
    buf << obj.particle_.getInvMassIncludingVirtual();
    buf << obj.particle_.getOldLinearAcceleration();
@@ -194,6 +206,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd:
    buf >> objparam.linearVelocity;
    buf >> objparam.invMass;
    buf >> objparam.oldForce;
+   buf >> objparam.charge;
    buf >> objparam.shapeID;
    buf >> objparam.baseShape;
    buf >> objparam.rotation;
@@ -207,6 +220,9 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd:
    buf >> objparam.hydrodynamicTorque;
    buf >> objparam.oldHydrodynamicForce;
    buf >> objparam.oldHydrodynamicTorque;
+   buf >> objparam.electrostaticForce;
+   buf >> objparam.totalDisplacement;
+   buf >> objparam.collisionForceNorm;
    buf >> objparam.virtualMass;
    buf >> objparam.invMassIncludingVirtual;
    buf >> objparam.oldLinearAcceleration;
diff --git a/src/mesa_pd/mpi/notifications/ParticleGhostCopyNotification.h b/src/mesa_pd/mpi/notifications/ParticleGhostCopyNotification.h
index fa87d8571fd11db5aa83b881a9461d9befb7bc3b..12ee33efad8e64abfeee2250226525b848381d4d 100644
--- a/src/mesa_pd/mpi/notifications/ParticleGhostCopyNotification.h
+++ b/src/mesa_pd/mpi/notifications/ParticleGhostCopyNotification.h
@@ -55,6 +55,7 @@ public:
       int owner {-1};
       walberla::mesa_pd::Vec3 linearVelocity {real_t(0)};
       walberla::real_t invMass {real_t(1)};
+      walberla::real_t charge {real_t(0)};
       size_t shapeID {};
       std::shared_ptr<walberla::mesa_pd::data::BaseShape> baseShape {make_shared<walberla::mesa_pd::data::BaseShape>()};
       walberla::mesa_pd::Rot3 rotation {};
@@ -83,6 +84,7 @@ inline data::ParticleStorage::iterator createNewParticle(data::ParticleStorage&
    pIt->setOwner(data.owner);
    pIt->setLinearVelocity(data.linearVelocity);
    pIt->setInvMass(data.invMass);
+   pIt->setCharge(data.charge);
    pIt->setShapeID(data.shapeID);
    pIt->setBaseShape(data.baseShape);
    pIt->setRotation(data.rotation);
@@ -126,6 +128,7 @@ mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, cons
    buf << obj.particle_.getOwner();
    buf << obj.particle_.getLinearVelocity();
    buf << obj.particle_.getInvMass();
+   buf << obj.particle_.getCharge();
    buf << obj.particle_.getShapeID();
    buf << obj.particle_.getBaseShape();
    buf << obj.particle_.getRotation();
@@ -150,6 +153,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd:
    buf >> objparam.owner;
    buf >> objparam.linearVelocity;
    buf >> objparam.invMass;
+   buf >> objparam.charge;
    buf >> objparam.shapeID;
    buf >> objparam.baseShape;
    buf >> objparam.rotation;
diff --git a/src/mesa_pd/mpi/notifications/ParticleMigrationNotification.h b/src/mesa_pd/mpi/notifications/ParticleMigrationNotification.h
index d7cf2b54d86bc57b5875130e5f7edf3c0f8d98b0..efaf5ff61897b6548fe2b0e47abfcc159ca9cbd6 100644
--- a/src/mesa_pd/mpi/notifications/ParticleMigrationNotification.h
+++ b/src/mesa_pd/mpi/notifications/ParticleMigrationNotification.h
@@ -52,6 +52,9 @@ public:
       walberla::mesa_pd::Vec3 hydrodynamicTorque_ {real_t(0)};
       walberla::mesa_pd::Vec3 oldHydrodynamicForce_ {real_t(0)};
       walberla::mesa_pd::Vec3 oldHydrodynamicTorque_ {real_t(0)};
+      walberla::mesa_pd::Vec3 electrostaticForce_ {real_t(0)};
+      walberla::real_t totalDisplacement_ {real_t(0)};
+      walberla::real_t collisionForceNorm_ {real_t(0)};
       walberla::real_t virtualMass_ {real_t(0)};
       walberla::real_t invMassIncludingVirtual_ {real_t(0)};
       walberla::mesa_pd::Vec3 oldLinearAcceleration_ {real_t(0)};
@@ -96,6 +99,9 @@ mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, cons
    buf << obj.particle_.getHydrodynamicTorque();
    buf << obj.particle_.getOldHydrodynamicForce();
    buf << obj.particle_.getOldHydrodynamicTorque();
+   buf << obj.particle_.getElectrostaticForce();
+   buf << obj.particle_.getTotalDisplacement();
+   buf << obj.particle_.getCollisionForceNorm();
    buf << obj.particle_.getVirtualMass();
    buf << obj.particle_.getInvMassIncludingVirtual();
    buf << obj.particle_.getOldLinearAcceleration();
@@ -118,6 +124,9 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd:
    buf >> objparam.hydrodynamicTorque_;
    buf >> objparam.oldHydrodynamicForce_;
    buf >> objparam.oldHydrodynamicTorque_;
+   buf >> objparam.electrostaticForce_;
+   buf >> objparam.totalDisplacement_;
+   buf >> objparam.collisionForceNorm_;
    buf >> objparam.virtualMass_;
    buf >> objparam.invMassIncludingVirtual_;
    buf >> objparam.oldLinearAcceleration_;
diff --git a/src/mesa_pd/mpi/notifications/ParticleUpdateNotification.h b/src/mesa_pd/mpi/notifications/ParticleUpdateNotification.h
index 95d0d7e37acc742b4cff1fce9442398f9a3a3042..73bc47acfc62602e7d3130c9ad09362efd18500f 100644
--- a/src/mesa_pd/mpi/notifications/ParticleUpdateNotification.h
+++ b/src/mesa_pd/mpi/notifications/ParticleUpdateNotification.h
@@ -48,6 +48,7 @@ public:
    walberla::id_t uid {UniqueID<data::Particle>::invalidID()};
    walberla::mesa_pd::Vec3 position {real_t(0)};
    walberla::mesa_pd::Vec3 linearVelocity {real_t(0)};
+   walberla::real_t charge {real_t(0)};
    walberla::mesa_pd::Rot3 rotation {};
    walberla::mesa_pd::Vec3 angularVelocity {real_t(0)};
    walberla::real_t radiusAtTemperature {real_t(0)};
@@ -85,6 +86,7 @@ mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, cons
    buf << obj.particle_.getUid();
    buf << obj.particle_.getPosition();
    buf << obj.particle_.getLinearVelocity();
+   buf << obj.particle_.getCharge();
    buf << obj.particle_.getRotation();
    buf << obj.particle_.getAngularVelocity();
    buf << obj.particle_.getRadiusAtTemperature();
@@ -100,6 +102,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, mesa_pd:
    buf >> objparam.uid;
    buf >> objparam.position;
    buf >> objparam.linearVelocity;
+   buf >> objparam.charge;
    buf >> objparam.rotation;
    buf >> objparam.angularVelocity;
    buf >> objparam.radiusAtTemperature;
diff --git a/tests/lbm_mesapd_coupling/CMakeLists.txt b/tests/lbm_mesapd_coupling/CMakeLists.txt
index 94438a2d9863101d05ea984eb59ff3f58f6aad2c..b6c8f15397a1317a5643d488b8b685f0fe37e43a 100644
--- a/tests/lbm_mesapd_coupling/CMakeLists.txt
+++ b/tests/lbm_mesapd_coupling/CMakeLists.txt
@@ -7,13 +7,13 @@
 waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_Mapping FILES mapping/ParticleMapping.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_Mapping PROCESSES 3)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_MovingMapping FILES momentum_exchange_method/MovingParticleMapping.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_MovingMapping FILES momentum_exchange_method/MovingParticleMappingMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_MovingMapping PROCESSES 3)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_DragForceSphere FILES momentum_exchange_method/DragForceSphere.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_DragForceSphere FILES momentum_exchange_method/DragForceSphereMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_DragForceSphere COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_DragForceSphere> --funcTest PROCESSES 2)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs FILES momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs FILES momentum_exchange_method/ForceBetweenTwoStationaryObjectsMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSS1 COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs> PROCESSES 1)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSS2 COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs> --useSBB PROCESSES 1)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSS3 COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs> --useCompressible PROCESSES 1)
@@ -23,13 +23,13 @@ waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSW2 COMMAND
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSW3 COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs> --useSphereWallSetup --useCompressible PROCESSES 1)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjsSW4 COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_ForceTwoStatObjs> --useSphereWallSetup --systemVelocity 0.1 PROCESSES 1)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_SettlingSphere FILES momentum_exchange_method/SettlingSphere.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_SettlingSphere FILES momentum_exchange_method/SettlingSphereMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_SettlingSphere COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_MEM_SettlingSphere> --funcTest PROCESSES 4)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_PdfReconstruction FILES momentum_exchange_method/PdfReconstruction.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_PdfReconstruction FILES momentum_exchange_method/PdfReconstructionMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_PdfReconstruction PROCESSES 3)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_UpdateParticleMapping FILES momentum_exchange_method/UpdateParticleMapping.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_MEM_UpdateParticleMapping FILES momentum_exchange_method/UpdateParticleMappingMEM.cpp DEPENDS core mesa_pd lbm lbm_mesapd_coupling domain_decomposition field vtk)
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_MEM_UpdateParticleMapping PROCESSES 1)
 
 waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_UTIL_LubricationCorrection FILES utility/LubricationCorrection.cpp DEPENDS mesa_pd lbm_mesapd_coupling )
@@ -47,18 +47,21 @@ waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_UTIL_HydForceMultBlocks_VVAvg CO
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_UTIL_HydForceMultBlocks_EulerNoAvg COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_UTIL_HydForceMultBlocks> --noForceAveraging PROCESSES 4 )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_UTIL_HydForceMultBlocks_VVNoAvg COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_UTIL_HydForceMultBlocks> --noForceAveraging --useVV PROCESSES 4 )
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_BodyAndVolumeFractionMapping FILES partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp DEPENDS blockforest boundary core field lbm_mesapd_coupling stencil mesa_pd )
-waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_BodyAndVolumeFractionMapping PROCESSES 27 )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping FILES partially_saturated_cells_method/ParticleAndVolumeFractionMappingPSM.cpp DEPENDS blockforest boundary core field lbm_mesapd_coupling stencil mesa_pd )
+waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping> PROCESSES 27 )
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere FILES partially_saturated_cells_method/DragForceSphere.cpp DEPENDS blockforest boundary core field lbm lbm_mesapd_coupling timeloop mesa_pd )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping_CPU_GPU FILES partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMapping.cpp DEPENDS blockforest boundary core gpu field lbm_mesapd_coupling stencil mesa_pd )
+waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping_CPU_GPU COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_ParticleAndVolumeFractionMapping_CPU_GPU> PROCESSES 27 )
+
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere FILES partially_saturated_cells_method/DragForceSpherePSM.cpp DEPENDS blockforest boundary core field lbm lbm_mesapd_coupling timeloop mesa_pd )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_DragForceSphereFuncTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_DragForceSphere> --funcTest PROCESSES 8 )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_DragForceSphere> PROCESSES 8 LABELS longrun CONFIGURATIONS Release RelWithDbgInfo)
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere FILES partially_saturated_cells_method/SettlingSphere.cpp DEPENDS blockforest boundary core domain_decomposition field lbm lbm_mesapd_coupling timeloop mesa_pd )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere FILES partially_saturated_cells_method/SettlingSpherePSM.cpp DEPENDS blockforest boundary core domain_decomposition field lbm lbm_mesapd_coupling timeloop mesa_pd )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_SettlingSphereFuncTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_SettlingSphere> --funcTest PROCESSES 4 )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_SettlingSphere> --resolution 70 PROCESSES 4 LABELS longrun CONFIGURATIONS Release RelWithDbgInfo )
 
-waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphere FILES partially_saturated_cells_method/TorqueSphere.cpp DEPENDS blockforest boundary core domain_decomposition field lbm stencil timeloop )
+waLBerla_compile_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphere FILES partially_saturated_cells_method/TorqueSpherePSM.cpp DEPENDS blockforest boundary core domain_decomposition field lbm stencil timeloop )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC1W1FuncTest     COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --funcTest --SC1W1   PROCESSES 1 )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC1W1SingleTest   COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --SC1W1              PROCESSES 1 LABELS longrun     CONFIGURATIONS Release RelWithDbgInfo )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC1W1ParallelTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --SC1W1              PROCESSES 8 LABELS verylongrun CONFIGURATIONS Release RelWithDbgInfo )
@@ -72,3 +75,28 @@ waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC2W2FuncTest
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC2W2SingleTest   COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --SC2W2              PROCESSES 1 LABELS longrun     CONFIGURATIONS Release RelWithDbgInfo )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC3W2FuncTest     COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --funcTest --SC3W2   PROCESSES 1 )
 waLBerla_execute_test( NAME LBM_MESAPD_COUPLING_PSM_TorqueSphereSC3W2SingleTest   COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere> --SC3W2              PROCESSES 1 LABELS longrun     CONFIGURATIONS Release RelWithDbgInfo )
+
+if (WALBERLA_BUILD_WITH_CODEGEN)
+    if (NOT WALBERLA_BUILD_WITH_GPU_SUPPORT OR (WALBERLA_BUILD_WITH_GPU_SUPPORT AND (CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60 OR WALBERLA_BUILD_WITH_HIP)))
+        waLBerla_compile_test(NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere_CPU_GPU FILES partially_saturated_cells_method/codegen/DragForceSpherePSM.cpp DEPENDS blockforest core gpu field lbm_mesapd_coupling mesa_pd PSMCodegenPython_srt_sc1)
+        waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere_CPU_GPU_FuncTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_DragForceSphere_CPU_GPU> --funcTest PROCESSES 1)
+        waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_DragForceSphere_CPU_GPU COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_DragForceSphere_CPU_GPU> PROCESSES 8 LABELS verylongrun CONFIGURATIONS Release RelWithDbgInfo)
+
+        foreach (collision_setup srt trt-smagorinsky)
+            foreach (solid_collision 1 2 3)
+                foreach (weighting 1 2)
+                    set(config ${collision_setup}_sc${solid_collision}_w${weighting})
+                    waLBerla_compile_test(NAME LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config} FILES partially_saturated_cells_method/codegen/TorqueSpherePSM.cpp DEPENDS blockforest core gpu field lbm_mesapd_coupling mesa_pd PSMCodegenPython_${collision_setup}_sc${solid_collision})
+                    target_compile_definitions(LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config} PRIVATE SC=${solid_collision})
+                    target_compile_definitions(LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config} PRIVATE Weighting=${weighting})
+                    waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config}_FuncTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config}> --funcTest PROCESSES 1)
+                    waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config} COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_TorqueSphere_CPU_GPU_${config}> PROCESSES 8 LABELS longrun CONFIGURATIONS Release RelWithDbgInfo)
+                endforeach ()
+            endforeach ()
+        endforeach ()
+
+        waLBerla_compile_test(NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere_CPU_GPU FILES partially_saturated_cells_method/codegen/SettlingSpherePSM.cpp DEPENDS blockforest core gpu field lbm lbm_mesapd_coupling mesa_pd timeloop vtk PSMCodegenPython_trt-smagorinsky_sc2)
+        waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere_CPU_GPU_FuncTest COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_SettlingSphere_CPU_GPU> --funcTest PROCESSES 1)
+        waLBerla_execute_test(NAME LBM_MESAPD_COUPLING_PSM_SettlingSphere_CPU_GPU COMMAND $<TARGET_FILE:LBM_MESAPD_COUPLING_PSM_SettlingSphere_CPU_GPU> PROCESSES 8 LABELS verylongrun CONFIGURATIONS Release RelWithDbgInfo)
+    endif ()
+endif ()
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphereMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphereMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjectsMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjectsMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMappingMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMappingMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstructionMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstructionMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphereMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphereMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMappingMEM.cpp
similarity index 100%
rename from tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp
rename to tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMappingMEM.cpp
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp
similarity index 90%
rename from tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp
rename to tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp
index 151b109c07e083b7d5adc55cb28a1dd0fce0e4eb..12bc2a240dd37765d161ebc3556c78d370546fa1 100644
--- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp
@@ -36,6 +36,7 @@
 #include "core/timing/RemainingTimeLogger.h"
 
 #include "field/AddToStorage.h"
+#include "field/vtk/VTKWriter.h"
 
 #include "lbm/communication/PdfFieldPackInfo.h"
 #include "lbm/field/AddToStorage.h"
@@ -43,10 +44,9 @@
 #include "lbm/lattice_model/D3Q19.h"
 #include "lbm/lattice_model/ForceModel.h"
 #include "lbm/sweeps/SweepWrappers.h"
+#include "lbm/vtk/Velocity.h"
 
 #include "lbm_mesapd_coupling/DataTypes.h"
-#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h"
-#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h"
 #include "lbm_mesapd_coupling/partially_saturated_cells_method/PSMSweep.h"
 #include "lbm_mesapd_coupling/partially_saturated_cells_method/PSMUtility.h"
 #include "lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.h"
@@ -103,9 +103,10 @@ class DragForceEvaluator
 {
  public:
    DragForceEvaluator(SweepTimeloop* timeloop, Setup* setup, const shared_ptr< StructuredBlockStorage >& blocks,
-                      const BlockDataID& pdfFieldID, const shared_ptr< ParticleAccessor_T >& ac,
-                      walberla::id_t sphereID)
-      : timeloop_(timeloop), setup_(setup), blocks_(blocks), pdfFieldID_(pdfFieldID), ac_(ac), sphereID_(sphereID),
+                      const BlockDataID& pdfFieldID, const BlockDataID& particleAndVolumeFractionFieldID,
+                      const shared_ptr< ParticleAccessor_T >& ac, walberla::id_t sphereID)
+      : timeloop_(timeloop), setup_(setup), blocks_(blocks), pdfFieldID_(pdfFieldID),
+        particleAndVolumeFractionFieldID_(particleAndVolumeFractionFieldID), ac_(ac), sphereID_(sphereID),
         normalizedDragOld_(0.0), normalizedDragNew_(0.0)
    {
       // calculate the analytical drag force value based on the series expansion of chi
@@ -198,13 +199,18 @@ class DragForceEvaluator
       {
          // retrieve the pdf field and the flag field from the block
          PdfField_T* pdfField = blockIt->getData< PdfField_T >(pdfFieldID_);
+         lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T* particleAndVolumeFractionField =
+            blockIt->getData< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >(
+               particleAndVolumeFractionFieldID_);
 
          // get the flag that marks a cell as being fluid
 
          auto xyzField = pdfField->xyzSize();
          for (auto cell : xyzField)
          {
-            velocity_sum += pdfField->getVelocity(cell)[0];
+            // TODO: the weighting template argument of getPSMMacroscopicVelocity is hard-coded to 1
+            velocity_sum += lbm_mesapd_coupling::psm::getPSMMacroscopicVelocity< LatticeModel_T, 1 >(
+               *blockIt, pdfField, particleAndVolumeFractionField, *blocks_, cell, *ac_)[0];
          }
       }
 
@@ -219,6 +225,7 @@ class DragForceEvaluator
 
    shared_ptr< StructuredBlockStorage > blocks_;
    const BlockDataID pdfFieldID_;
+   const BlockDataID particleAndVolumeFractionFieldID_;
 
    shared_ptr< ParticleAccessor_T > ac_;
    const walberla::id_t sphereID_;
@@ -278,11 +285,12 @@ int main(int argc, char** argv)
    // Customization //
    ///////////////////
 
-   bool shortrun = false;
-   bool funcTest = false;
-   bool logging  = false;
-   real_t tau    = real_c(1.5);
-   uint_t length = uint_c(32);
+   bool shortrun       = false;
+   bool funcTest       = false;
+   bool logging        = false;
+   uint_t vtkFrequency = uint_c(0);
+   real_t tau          = real_c(1.5);
+   uint_t length       = uint_c(32);
 
    for (int i = 1; i < argc; ++i)
    {
@@ -311,6 +319,11 @@ int main(int argc, char** argv)
          length = uint_c(std::atof(argv[++i]));
          continue;
       }
+      if (std::strcmp(argv[i], "--vtkFrequency") == 0)
+      {
+         vtkFrequency = uint_c(std::atof(argv[++i]));
+         continue;
+      }
       WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
    }
 
@@ -440,7 +453,8 @@ int main(int argc, char** argv)
 
    // add LBM communication function and streaming & force evaluation
    using DragForceEval_T = DragForceEvaluator< ParticleAccessor_T >;
-   auto forceEval        = make_shared< DragForceEval_T >(&timeloop, &setup, blocks, pdfFieldID, accessor, sphereID);
+   auto forceEval        = make_shared< DragForceEval_T >(&timeloop, &setup, blocks, pdfFieldID,
+                                                   particleAndVolumeFractionFieldID, accessor, sphereID);
    timeloop.add() << BeforeFunction(optimizedPDFCommunicationScheme, "LBM Communication")
                   << Sweep(lbm::makeStreamSweep(sweep), "cell-wise LB sweep (stream)")
                   << AfterFunction(SharedFunctor< DragForceEval_T >(forceEval), "drag force evaluation");
@@ -454,7 +468,17 @@ int main(int argc, char** argv)
       "reset force on sphere");
 
    timeloop.addFuncAfterTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+   if (vtkFrequency > 0)
+   {
+      const std::string path = "vtk_out/dragForceSphere";
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "psm_velocity_field", vtkFrequency, 0, false, path,
+                                                      "simulation_step", false, true, true, false, 0);
+
+      auto velWriter = make_shared< walberla::lbm::VelocityVTKWriter< LatticeModel_T > >(pdfFieldID, "Velocity");
+      vtkOutput->addCellDataWriter(velWriter);
 
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
    ////////////////////////
    // EXECUTE SIMULATION //
    ////////////////////////
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMappingPSM.cpp
similarity index 98%
rename from tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp
rename to tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMappingPSM.cpp
index d1bd029a7fa4109d2390b14f93744d2b8d6b4d75..b49590aedd345de8fd8efa5bfbaaa54a0f75d93c 100644
--- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMappingPSM.cpp
@@ -146,9 +146,8 @@ int main(int argc, char** argv)
 
    // set up synchronization
    std::function< void(void) > syncCall = [&]() {
-      const real_t overlap = real_t(1.5) * dx;
       mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
-      syncNextNeighborFunc(*ps, *mesapdDomain, overlap);
+      syncNextNeighborFunc(*ps, *mesapdDomain);
    };
 
    // add the sphere in the center of the domain
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSpherePSM.cpp
similarity index 99%
rename from tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp
rename to tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSpherePSM.cpp
index 8144d27523f2d72c57bc08e735beb2f8f8d6adf7..f0dd609111994eecf7b0d5519ccd9e2da5376c51 100644
--- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSpherePSM.cpp
@@ -592,9 +592,8 @@ int main(int argc, char** argv)
 
    // set up RPD functionality
    std::function< void(void) > syncCall = [ps, rpdDomain]() {
-      const real_t overlap = real_t(1.5);
       mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
-      syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+      syncNextNeighborFunc(*ps, *rpdDomain);
    };
 
    syncCall();
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp
similarity index 98%
rename from tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp
rename to tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp
index f0db14fe5d902f80b1e5f293ef6dc0b5a50f64ca..419897653e391094f62a668e2ec3602395fd4165 100644
--- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp
@@ -57,8 +57,6 @@
 #include "mesa_pd/domain/BlockForestDomain.h"
 #include "mesa_pd/mpi/SyncNextNeighbors.h"
 
-#include "stencil/D3Q27.h"
-
 #include "timeloop/SweepTimeloop.h"
 
 #include <iostream>
@@ -392,11 +390,10 @@ int main(int argc, char** argv)
       sphereParticle->setInteractionRadius(setup.radius);
    }
 
-   // synchronize often enough for large bodies
+   // synchronize often enough for large particles
    std::function< void(void) > syncCall = [&]() {
-      // const real_t overlap = real_t(1.5) * dx;
       mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
-      syncNextNeighborFunc(*ps, *mesapdDomain, overlap);
+      syncNextNeighborFunc(*ps, *mesapdDomain);
    };
 
    syncCall();
@@ -418,7 +415,7 @@ int main(int argc, char** argv)
       field::addToStorage< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >(
          blocks, "particle and volume fraction field",
          std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::fzyx, 0);
-   // map bodies and calculate solid volume fraction initially
+   // map particles and calculate solid volume fraction initially
    lbm_mesapd_coupling::psm::ParticleAndVolumeFractionMapping particleMapping(
       blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(), particleAndVolumeFractionFieldID, 4);
    particleMapping();
@@ -445,7 +442,7 @@ int main(int argc, char** argv)
    // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
    std::function< void() > commFunction;
 
-   blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > scheme(blocks);
+   blockforest::communication::UniformBufferedScheme< Stencil_T > scheme(blocks);
    scheme.addPackInfo(make_shared< field::communication::PackInfo< PdfField_T > >(pdfFieldID));
    commFunction = scheme;
 
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/DragForceSpherePSM.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/DragForceSpherePSM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84edc556741f33c130d5160f5e2dac06eb24cf2d
--- /dev/null
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/DragForceSpherePSM.cpp
@@ -0,0 +1,552 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DragForceSpherePSM.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//! \brief Modification of partially_saturated_cells_method/DragForceSphere.cpp
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Environment.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Reduce.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/vtk/VTKWriter.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+
+#include <iostream>
+
+// codegen
+#include "InitializeDomainForPSM.h"
+#include "PSMPackInfo.h"
+#include "PSMSweep.h"
+#include "PSM_InfoHeader.h"
+#include "PSM_MacroGetter.h"
+
+namespace drag_force_sphere_psm
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+using namespace lbm_mesapd_coupling::psm::gpu;
+
+typedef pystencils::PSMPackInfo PackInfo_T;
+
+////////////////
+// PARAMETERS //
+////////////////
+
+struct Setup // simulation parameters shared between main() and DragForceEvaluator
+{
+   uint_t checkFrequency; // the drag force is evaluated only every checkFrequency time steps
+   real_t visc;           // viscosity in lattice units
+   real_t tau;            // relaxation time
+   real_t radius;         // sphere radius
+   uint_t length;         // length of the cubic domain in lattice cells
+   real_t chi;            // porosity parameter: diameter / length
+   real_t extForce;       // constant particle force in lattice units
+   real_t analyticalDrag; // reference drag from the series expansion in chi, written by DragForceEvaluator's constructor
+};
+
+template< typename ParticleAccessor_T >
+class DragForceEvaluator // functor that evaluates the normalized drag on the sphere every setup->checkFrequency steps
+{
+ public:
+   DragForceEvaluator(SweepTimeloop* timeloop, Setup* setup, const shared_ptr< StructuredBlockStorage >& blocks,
+                      const BlockDataID& velocityFieldID, const shared_ptr< ParticleAccessor_T >& ac,
+                      walberla::id_t sphereID)
+      : timeloop_(timeloop), setup_(setup), blocks_(blocks), velocityFieldID_(velocityFieldID), ac_(ac),
+        sphereID_(sphereID), normalizedDragOld_(0.0), normalizedDragNew_(0.0)
+   {
+      // calculate the analytical drag force value based on the series expansion of chi
+      // see also Sangani - Slow flow through a periodic array of spheres, IJMF 1982. Eq. 60 and Table 1
+      real_t analyticalDrag = real_c(0);
+      real_t tempChiPowS    = real_c(1); // running power chi^s
+
+      // coefficients to calculate the drag in a series expansion
+      real_t dragCoefficients[31] = { real_c(1.000000),  real_c(1.418649),  real_c(2.012564),   real_c(2.331523),
+                                      real_c(2.564809),  real_c(2.584787),  real_c(2.873609),   real_c(3.340163),
+                                      real_c(3.536763),  real_c(3.504092),  real_c(3.253622),   real_c(2.689757),
+                                      real_c(2.037769),  real_c(1.809341),  real_c(1.877347),   real_c(1.534685),
+                                      real_c(0.9034708), real_c(0.2857896), real_c(-0.5512626), real_c(-1.278724),
+                                      real_c(1.013350),  real_c(5.492491),  real_c(4.615388),   real_c(-0.5736023),
+                                      real_c(-2.865924), real_c(-4.709215), real_c(-6.870076),  real_c(0.1455304),
+                                      real_c(12.51891),  real_c(9.742811),  real_c(-5.566269) };
+
+      for (uint_t s = 0; s <= uint_t(30); ++s) // accumulates sum_{s=0}^{30} dragCoefficients[s] * chi^s
+      {
+         analyticalDrag += dragCoefficients[s] * tempChiPowS;
+         tempChiPowS *= setup->chi;
+      }
+      setup_->analyticalDrag = analyticalDrag; // publish the reference value through the shared Setup
+   }
+
+   // evaluate the acting drag force
+   void operator()()
+   {
+      const uint_t timestep(timeloop_->getCurrentTimeStep() + 1);
+
+      if (timestep % setup_->checkFrequency != 0) return; // only every checkFrequency-th step is evaluated
+
+      // get force in x-direction acting on the sphere
+      real_t forceX = computeDragForce();
+      // get average volumetric flowrate in the domain
+      real_t uBar = computeAverageVel();
+
+      // f_total = f_drag + f_buoyancy
+      real_t totalForce =
+         forceX + real_c(4.0 / 3.0) * math::pi * setup_->radius * setup_->radius * setup_->radius * setup_->extForce;
+      // normalize the total force by 6 * pi * visc * radius * uBar
+      real_t normalizedDragForce = totalForce / real_c(6.0 * math::pi * setup_->visc * setup_->radius * uBar);
+
+      // update drag force values
+      normalizedDragOld_ = normalizedDragNew_;
+      normalizedDragNew_ = normalizedDragForce;
+   }
+
+   // return the relative temporal change in the normalized drag
+   real_t getDragForceDiff() const { return std::fabs((normalizedDragNew_ - normalizedDragOld_) / normalizedDragNew_); }
+
+   // return the drag force
+   real_t getDragForce() const { return normalizedDragNew_; }
+
+   void logResultToFile(const std::string& filename) const
+   {
+      // write to file if desired
+      // format: length tau viscosity simulatedDrag analyticalDrag\n
+      WALBERLA_ROOT_SECTION()
+      {
+         std::ofstream file;
+         file.open(filename.c_str(), std::ofstream::app); // append, so earlier results in the file are kept
+         file.precision(8);
+         file << setup_->length << " " << setup_->tau << " " << setup_->visc << " " << normalizedDragNew_ << " "
+              << setup_->analyticalDrag << "\n";
+         file.close();
+      }
+   }
+
+ private:
+   // obtain the drag force acting on the sphere by summing up all the process local parts of fX
+   real_t computeDragForce()
+   {
+      size_t idx = ac_->uidToIdx(sphereID_);
+      WALBERLA_ASSERT_UNEQUAL(idx, ac_->getInvalidIdx(), "Index of particle is invalid!");
+      real_t force = real_t(0);
+      if (idx != ac_->getInvalidIdx()) { force = ac_->getHydrodynamicForce(idx)[0]; }
+
+      WALBERLA_MPI_SECTION() { mpi::allReduceInplace(force, mpi::SUM); }
+
+      return force;
+   }
+
+   // calculate the average velocity in forcing direction (here: x) inside the domain (assuming dx=1)
+   real_t computeAverageVel()
+   {
+      auto velocity_sum = real_t(0);
+      // iterate all blocks stored locally on this process
+      for (auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt)
+      {
+         // retrieve the velocity field from the block
+         VelocityField_T* velocityField = blockIt->getData< VelocityField_T >(velocityFieldID_);
+
+         // sum component 0 (x) of the velocity over all cells of the field's xyzSize() interval
+
+         auto xyzField = velocityField->xyzSize();
+         for (auto cell : xyzField)
+         {
+            // TODO: fix velocity computation by using getPSMMacroscopicVelocity
+            velocity_sum += velocityField->get(cell, 0);
+         }
+      }
+
+      WALBERLA_MPI_SECTION() { mpi::allReduceInplace(velocity_sum, mpi::SUM); }
+
+      return velocity_sum / real_c(setup_->length * setup_->length * setup_->length);
+   }
+
+   SweepTimeloop* timeloop_;
+
+   Setup* setup_;
+
+   shared_ptr< StructuredBlockStorage > blocks_;
+   const BlockDataID velocityFieldID_;
+
+   shared_ptr< ParticleAccessor_T > ac_;
+   const walberla::id_t sphereID_;
+
+   // normalized drag force of the previous and of the most recent evaluation
+   real_t normalizedDragOld_;
+   real_t normalizedDragNew_;
+};
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Testcase that checks the drag force acting on a fixed sphere in the center of a cubic domain in Stokes flow
+ *
+ * The drag force for this problem (often denoted as Simple Cubic setup) is given by a semi-analytical series expansion.
+ * The cubic domain is periodic in all directions, making it a physically infinite periodic array of spheres.
+   \verbatim
+           _______________
+        ->|               |->
+        ->|      ___      |->
+      W ->|     /   \     |-> E
+      E ->|    |  x  |    |-> A
+      S ->|     \___/     |-> S
+      T ->|               |-> T
+        ->|_______________|->
+
+   \endverbatim
+ *
+ * The collision model used for the LBM is TRT with a relaxation parameter tau=1.5 and the magic parameter 3/16.
+ * The Stokes approximation of the equilibrium PDFs is used.
+ * The flow is driven by a constant particle force of 1e-7.
+ * The domain is 32x32x32, and the sphere has a diameter of 16 cells ( chi * domainlength )
+ * The simulation is run until the relative change in the drag force between 100 time steps is less than 1e-7.
+ * The RPD is not used since the sphere is kept fixed and the force is explicitly reset after each time step.
+ * To avoid periodicity constraint problems, the sphere is set as global.
+ *
+ */
+//*******************************************************************************************************************
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+
+   mpi::Environment env(argc, argv);
+
+   logging::Logging::instance()->setLogLevel(logging::Logging::INFO);
+
+   auto processes = MPIManager::instance()->numProcesses();
+
+   if (processes != 1 && processes != 2 && processes != 4 && processes != 8)
+   {
+      std::cerr << "Number of processes must be equal to either 1, 2, 4, or 8!" << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   ///////////////////
+   // Customization //
+   ///////////////////
+
+   bool shortrun       = false;
+   bool funcTest       = false;
+   bool logging        = false;
+   uint_t vtkFrequency = uint_c(0);
+   real_t tau          = real_c(1.5);
+   uint_t length       = uint_c(32);
+
+   for (int i = 1; i < argc; ++i) // a value option that is not followed by a value is rejected by the abort below
+   {
+      if (std::strcmp(argv[i], "--shortrun") == 0)
+      {
+         shortrun = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--funcTest") == 0)
+      {
+         funcTest = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--logging") == 0)
+      {
+         logging = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--tau") == 0 && i + 1 < argc)
+      {
+         tau = real_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--length") == 0 && i + 1 < argc)
+      {
+         length = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--vtkFrequency") == 0 && i + 1 < argc)
+      {
+         vtkFrequency = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   ///////////////////////////
+   // SIMULATION PROPERTIES //
+   ///////////////////////////
+
+   Setup setup;
+
+   setup.length                  = length;       // length of the cubic domain in lattice cells
+   setup.chi                     = real_c(0.5);  // porosity parameter: diameter / length
+   setup.tau                     = tau;          // relaxation time
+   setup.extForce                = real_c(1e-7); // constant particle force in lattice units
+   setup.checkFrequency          = uint_t(100);  // evaluate the drag force only every checkFrequency time steps
+   setup.radius                  = real_c(0.5) * setup.chi * real_c(setup.length); // sphere radius
+   setup.visc                    = (setup.tau - real_c(0.5)) / real_c(3);          // viscosity in lattice units
+   const real_t omega            = real_c(1) / setup.tau;                          // relaxation rate
+   const real_t dx               = real_c(1);                                      // lattice dx
+   const real_t convergenceLimit = real_c(1e-7); // tolerance for relative change in drag force
+   const uint_t timesteps =
+      funcTest ? 1 : (shortrun ? uint_c(150) : uint_c(50000)); // maximum number of time steps for the whole simulation
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const uint_t XBlocks = (processes >= 2) ? uint_t(2) : uint_t(1);
+   const uint_t YBlocks = (processes >= 4) ? uint_t(2) : uint_t(1);
+   const uint_t ZBlocks = (processes == 8) ? uint_t(2) : uint_t(1);
+   const uint_t XCells  = setup.length / XBlocks;
+   const uint_t YCells  = setup.length / YBlocks;
+   const uint_t ZCells  = setup.length / ZBlocks;
+
+   // create fully periodic domain
+   auto blocks = blockforest::createUniformBlockGrid(XBlocks, YBlocks, ZBlocks, XCells, YCells, ZCells, dx, true, true,
+                                                     true, true);
+
+   /////////
+   // RPD //
+   /////////
+
+   mesa_pd::domain::BlockForestDomain domain(blocks->getBlockForestPointer());
+
+   // init data structures
+   auto ps                  = std::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = std::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = make_shared< ParticleAccessor_T >(ps, ss);
+   auto sphereShape         = ss->create< mesa_pd::data::Sphere >(setup.radius);
+
+   //////////////////
+   // RPD COUPLING //
+   //////////////////
+
+   // margin (in lattice cells) that has to remain between the sphere radius and half the domain length
+   const real_t overlap = real_t(1.5) * dx;
+
+   if (setup.radius > real_c(setup.length) * real_t(0.5) - overlap)
+   {
+      std::cerr << "Periodic sphere is too large and would lead to incorrect mapping!" << std::endl;
+      // solution: create the periodic copies explicitly
+      return EXIT_FAILURE;
+   }
+
+   // create the sphere in the middle of the domain
+   // it is global and thus present on all processes
+   Vector3< real_t > position(real_c(setup.length) * real_c(0.5));
+   walberla::id_t sphereID;
+   {
+      mesa_pd::data::Particle&& p = *ps->create(true);
+      p.setPosition(position);
+      p.setInteractionRadius(setup.radius);
+      p.setOwner(mpi::MPIManager::instance()->rank());
+      p.setShapeID(sphereShape);
+      sphereID = p.getUid();
+   }
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // add fields
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID BFieldID         = field::addToStorage< BField_T >(blocks, "B", real_t(0), field::fzyx);
+   BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
+#else
+   BlockDataID pdfFieldCPUGPUID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
+#endif
+
+   BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "Density", real_t(0), field::fzyx);
+   BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "Velocity", real_t(0), field::fzyx);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, 0, false);
+#else
+   walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
+#endif
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   // add particle and volume fraction data structures
+   ParticleAndVolumeFractionSoA_T< 1 > particleAndVolumeFractionSoA(blocks, omega);
+
+   // map particles and calculate solid volume fraction initially
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::GlobalParticlesSelector(),
+                                         particleAndVolumeFractionSoA, Vector3(8));
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      psmSweepCollection.particleMappingSweep(&(*blockIt));
+   }
+
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt));
+      pdfSetter(&(*blockIt));
+   }
+
+   // since external forcing is applied, the evaluation of the velocity has to be carried out directly after the
+   // streaming step; however, the default sweep is a stream - collide step, i.e. after the sweep, the velocity
+   // evaluation is not correct (NOTE(review): a single PSMSweep is used below - confirm this remark still applies)
+   pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                 particleAndVolumeFractionSoA.particleForcesFieldID,
+                                 particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID,
+                                 setup.extForce, real_t(0.0), real_t(0.0), omega);
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, setup.extForce,
+                                           real_t(0.0), real_t(0.0));
+#else
+   pystencils::PSM_MacroGetter getterSweep(particleAndVolumeFractionSoA.BFieldID, densityFieldID, pdfFieldCPUGPUID,
+                                           velFieldID, setup.extForce, real_t(0.0), real_t(0.0));
+#endif
+
+   // add LBM communication function and streaming & force evaluation
+   using DragForceEval_T = DragForceEvaluator< ParticleAccessor_T >;
+   auto forceEval        = make_shared< DragForceEval_T >(&timeloop, &setup, blocks, velFieldID, accessor, sphereID);
+   timeloop.add() << BeforeFunction(communication, "LBM Communication")
+                  << Sweep(deviceSyncWrapper(psmSweepCollection.setParticleVelocitiesSweep), "Set particle velocities");
+   timeloop.add() << Sweep(deviceSyncWrapper(PSMSweep), "cell-wise PSM sweep");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.reduceParticleForcesSweep), "Reduce particle forces");
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   timeloop.add() << Sweep(gpu::fieldCpyFunctor< PdfField_T, gpu::GPUField< real_t > >(pdfFieldID, pdfFieldCPUGPUID),
+                           "copy pdf from GPU to CPU");
+   timeloop.add() << Sweep(
+      gpu::fieldCpyFunctor< BField_T, gpu::GPUField< real_t > >(BFieldID, particleAndVolumeFractionSoA.BFieldID),
+      "copy B field from GPU to CPU");
+#endif
+   timeloop.add() << Sweep(getterSweep, "compute velocity")
+                  << AfterFunction(SharedFunctor< DragForceEval_T >(forceEval), "drag force evaluation");
+
+   // resetting force
+   timeloop.addFuncAfterTimeStep(
+      [ps, accessor]() {
+         ps->forEachParticle(false, mesa_pd::kernel::SelectAll(), *accessor,
+                             lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel(), *accessor);
+      },
+      "reset force on sphere");
+
+   timeloop.addFuncAfterTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+   if (vtkFrequency > 0)
+   {
+      const std::string path = "vtk_out/dragForceSphereCPUGPU";
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "psm_velocity_fieldCPUGPU", vtkFrequency, 0, false, path,
+                                                      "simulation_step", false, true, true, false, 0);
+
+      vtkOutput->addBeforeFunction([&]() {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+         gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldCPUGPUID);
+         gpu::fieldCpy< BField_T, gpu::GPUField< real_t > >(blocks, BFieldID, particleAndVolumeFractionSoA.BFieldID);
+#endif
+         for (auto& block : *blocks)
+            getterSweep(&block);
+      });
+      vtkOutput->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i)
+   {
+      // perform a single simulation step
+      timeloop.singleStep(timeloopTiming);
+
+      // check if the relative change in the normalized drag force is below the specified convergence criterion
+      if (i > setup.checkFrequency && forceEval->getDragForceDiff() < convergenceLimit)
+      {
+         // if simulation has converged, terminate simulation
+         break;
+      }
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   if (!funcTest && !shortrun)
+   {
+      // check the result
+      real_t relErr = std::fabs((setup.analyticalDrag - forceEval->getDragForce()) / setup.analyticalDrag);
+      if (logging)
+      {
+         WALBERLA_ROOT_SECTION()
+         {
+            std::cout << "Analytical drag: " << setup.analyticalDrag << "\n"
+                      << "Simulated drag: " << forceEval->getDragForce() << "\n"
+                      << "Relative error: " << relErr << "\n";
+         }
+         forceEval->logResultToFile("log_DragForceSphere.txt");
+      }
+      // the relative error has to be below 10%
+      WALBERLA_CHECK_LESS(relErr, real_c(0.1));
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace drag_force_sphere_psm
+
+int main(int argc, char** argv) { return drag_force_sphere_psm::main(argc, argv); } // forward the test's exit code
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMapping.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMapping.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d71062655a4e4b4bffaa325bb040654d9e603900
--- /dev/null
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/ParticleAndVolumeFractionMapping.cpp
@@ -0,0 +1,311 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParticleAndVolumeFractionMapping.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+
+#include "field/AddToStorage.h"
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+#   include "gpu/FieldCopy.h"
+#   include "gpu/GPUField.h"
+#endif
+
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/SemiImplicitEuler.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+
+#include <memory>
+
+namespace particle_volume_fraction_check
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using namespace walberla::lbm_mesapd_coupling::psm::gpu;
+
+//*******************************************************************************************************************
+/*!\brief Calculating the sum over all fraction values. This can be used as a sanity check since it has to be roughly
+ * equal to the volume of all particles.
+ *
+ */
+//*******************************************************************************************************************
+class FractionFieldSum
+{
+ public:
+   // Keeps a handle to the block storage and the IDs of the two fields that operator() reads.
+   FractionFieldSum(const shared_ptr< StructuredBlockStorage >& blockStorage,
+                    const BlockDataID& nOverlappingParticlesFieldID, const BlockDataID& BsFieldID)
+      : blockStorage_(blockStorage), nOverlappingParticlesFieldID_(nOverlappingParticlesFieldID), BsFieldID_(BsFieldID)
+   {}
+
+   // Returns the sum of the Bs entries over all cells of all blocks returned by the block storage iteration.
+   // Inside WALBERLA_MPI_SECTION the local result is additionally sum-reduced with mpi::allReduceInplace,
+   // so the same value is returned on every process.
+   real_t operator()()
+   {
+      real_t total = 0.0;
+
+      for (auto it = blockStorage_->begin(); it != blockStorage_->end(); ++it)
+      {
+         auto overlapCount = it->getData< nOverlappingParticlesField_T >(nOverlappingParticlesFieldID_);
+         auto fractions    = it->getData< BsField_T >(BsFieldID_);
+
+         // the Bs field defines the extent that is traversed
+         const cell_idx_t nx = cell_idx_c(fractions->xSize());
+         const cell_idx_t ny = cell_idx_c(fractions->ySize());
+         const cell_idx_t nz = cell_idx_c(fractions->zSize());
+
+         // traverse z-y-x; per cell only the first overlapCount(x, y, z) entries are added
+         for (cell_idx_t z = 0; z < nz; ++z)
+            for (cell_idx_t y = 0; y < ny; ++y)
+               for (cell_idx_t x = 0; x < nx; ++x)
+                  for (uint_t n = 0; n < overlapCount->get(x, y, z); ++n)
+                     total += fractions->get(x, y, z, n);
+      }
+
+      WALBERLA_MPI_SECTION() { mpi::allReduceInplace(total, mpi::SUM); }
+
+      return total;
+   }
+
+ private:
+   // block storage whose blocks are visited
+   shared_ptr< StructuredBlockStorage > blockStorage_;
+   // field holding the number of entries to read per cell
+   BlockDataID nOverlappingParticlesFieldID_;
+   // field holding the per-cell values that are summed
+   BlockDataID BsFieldID_;
+};
+
+////////////////
+// Parameters //
+////////////////
+
+struct Setup // parameters of the mapping test, filled in by main()
+{
+   // domain size (in lattice cells) in x, y and z direction
+   uint_t xlength;
+   uint_t ylength;
+   uint_t zlength;
+
+   // number of blocks in x, y and z direction
+   Vector3< uint_t > nBlocks;
+
+   // cells per block in x, y and z direction
+   Vector3< uint_t > cellsPerBlock;
+
+   real_t sphereDiam; // sphere diameter; main() derives the domain length and the sphere radius from it
+
+   uint_t timesteps; // number of iterations of the check-and-advance loop in main()
+};
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Testcase that checks if the particle mapping sweep of the PSMSweepCollection works as intended
+ *
+ * Two sphere particles are placed inside the domain and are moving with a constant velocity. The overlap fraction is
+ * computed for all cells in each time step. If the mapping is correct, the sum over all fractions should be roughly
+ * equivalent to the combined volume of both spheres.
+ *
+ */
+//*******************************************************************************************************************
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+
+   mpi::Environment env(argc, argv);
+
+   logging::Logging::instance()->setLogLevel(logging::Logging::INFO);
+
+   auto processes = MPIManager::instance()->numProcesses();
+
+   if (processes != 27) // 3 x 3 x 3 blocks are configured below
+   {
+      std::cerr << "Number of processes must be 27!" << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   ///////////////////////////
+   // SIMULATION PROPERTIES //
+   ///////////////////////////
+
+   Setup setup;
+
+   setup.sphereDiam = real_c(12);
+   setup.zlength    = uint_c(4 * setup.sphereDiam); // cubic domain with an edge length of four sphere diameters
+   setup.xlength    = setup.zlength;
+   setup.ylength    = setup.zlength;
+
+   const real_t sphereRadius = real_c(0.5) * setup.sphereDiam;
+   const real_t dx           = real_c(1);
+
+   setup.timesteps = 100;
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   setup.nBlocks[0]       = uint_c(3);
+   setup.nBlocks[1]       = uint_c(3);
+   setup.nBlocks[2]       = uint_c(3);
+   setup.cellsPerBlock[0] = setup.xlength / setup.nBlocks[0];
+   setup.cellsPerBlock[1] = setup.ylength / setup.nBlocks[1];
+   setup.cellsPerBlock[2] = setup.zlength / setup.nBlocks[2];
+
+   auto blocks =
+      blockforest::createUniformBlockGrid(setup.nBlocks[0], setup.nBlocks[1], setup.nBlocks[2], setup.cellsPerBlock[0],
+                                          setup.cellsPerBlock[1], setup.cellsPerBlock[2], dx, true, true, true, true);
+
+   ////////////
+   // MesaPD //
+   ////////////
+
+   auto mesapdDomain        = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+   auto ps                  = std::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = std::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+
+   // set up synchronization
+   std::function< void(void) > syncCall = [&]() {
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *mesapdDomain);
+   };
+
+   // add the first sphere in the center of the domain (created only on the process whose subdomain contains it)
+   Vector3< real_t > position(real_c(setup.xlength) * real_c(0.5), real_c(setup.ylength) * real_c(0.5),
+                              real_c(setup.zlength) * real_c(0.5));
+   Vector3< real_t > velocity(real_c(0.1), real_c(0.1), real_c(0.1));
+   auto sphereShape = ss->create< mesa_pd::data::Sphere >(sphereRadius);
+
+   if (mesapdDomain->isContainedInProcessSubdomain(uint_c(walberla::mpi::MPIManager::instance()->rank()), position))
+   {
+      auto sphereParticle = ps->create();
+
+      sphereParticle->setShapeID(sphereShape);
+      sphereParticle->setType(0);
+      sphereParticle->setPosition(position);
+      sphereParticle->setLinearVelocity(velocity);
+      sphereParticle->setOwner(walberla::MPIManager::instance()->rank());
+      sphereParticle->setInteractionRadius(sphereRadius);
+   }
+   // add a second sphere with the same shape and velocity at position (0, 0, 0)
+   Vector3< real_t > position2(real_c(0.0), real_c(0.0), real_c(0.0));
+   Vector3< real_t > velocity2(real_c(0.1), real_c(0.1), real_c(0.1));
+
+   if (mesapdDomain->isContainedInProcessSubdomain(uint_c(walberla::mpi::MPIManager::instance()->rank()), position2))
+   {
+      auto sphereParticle = ps->create();
+
+      sphereParticle->setShapeID(sphereShape);
+      sphereParticle->setType(0);
+      sphereParticle->setPosition(position2);
+      sphereParticle->setLinearVelocity(velocity2);
+      sphereParticle->setOwner(walberla::MPIManager::instance()->rank());
+      sphereParticle->setInteractionRadius(sphereRadius);
+   }
+
+   syncCall();
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   // CPU-side fields that receive copies of the GPU fields before each check in the loop below
+   BlockDataID nOverlappingParticlesFieldID = field::addToStorage< nOverlappingParticlesField_T >(
+      blocks, "number of overlapping particles field CPU", 0, field::fzyx, 1);
+   BlockDataID BsFieldID = field::addToStorage< BsField_T >(blocks, "Bs field CPU", 0, field::fzyx, 1);
+#endif
+
+   // dummy value for omega since it is not used because Weighting_T == 1
+   real_t omega = real_t(42.0);
+   ParticleAndVolumeFractionSoA_T< 1 > particleAndVolumeFractionSoA(blocks, omega);
+
+   // map the particles into the fields once before the loop
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
+                                         particleAndVolumeFractionSoA, Vector3(16));
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      psmSweepCollection.particleMappingSweep(&(*blockIt));
+   }
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   FractionFieldSum fractionFieldSum(blocks, nOverlappingParticlesFieldID, BsFieldID);
+#else
+   FractionFieldSum fractionFieldSum(blocks, particleAndVolumeFractionSoA.nOverlappingParticlesFieldID,
+                                     particleAndVolumeFractionSoA.BsFieldID);
+#endif
+   auto selector = mesa_pd::kernel::SelectMaster();
+   mesa_pd::kernel::SemiImplicitEuler particleIntegration(1.0);
+
+   for (uint_t i = 0; i < setup.timesteps; ++i)
+   {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      // copy data back to perform the check on CPU
+      for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+      {
+         gpu::fieldCpySweepFunction< nOverlappingParticlesField_T, nOverlappingParticlesFieldGPU_T >(
+            nOverlappingParticlesFieldID, particleAndVolumeFractionSoA.nOverlappingParticlesFieldID, &(*blockIt));
+         gpu::fieldCpySweepFunction< BsField_T, BsFieldGPU_T >(BsFieldID, particleAndVolumeFractionSoA.BsFieldID,
+                                                               &(*blockIt));
+      }
+#endif
+
+      // check that the sum over all fractions deviates by less than 5 from the combined volume of the two spheres
+      real_t sum = fractionFieldSum();
+      WALBERLA_CHECK_LESS(std::fabs(4.0 / 3.0 * math::pi * sphereRadius * sphereRadius * sphereRadius * 2 - sum),
+                          real_c(5));
+
+      // update position
+      ps->forEachParticle(false, selector, *accessor, particleIntegration, *accessor);
+      syncCall();
+
+      // map particles into field
+      for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+      {
+         psmSweepCollection.particleMappingSweep(&(*blockIt));
+      }
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace particle_volume_fraction_check
+
+int main(int argc, char** argv) { return particle_volume_fraction_check::main(argc, argv); } // forward the exit code
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/SettlingSpherePSM.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/SettlingSpherePSM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bbcf1a2392322e9e4f45ffeb316240ba3bc0abe4
--- /dev/null
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/SettlingSpherePSM.cpp
@@ -0,0 +1,871 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file SettlingSpherePSM.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//! \brief Modification of momentum_exchange_method/SettlingSphere.cpp
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/all.h"
+#include "core/math/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/vtk/all.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm/field/AddToStorage.h"
+#include "lbm/vtk/all.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/AddForceOnParticlesKernel.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/shape/HalfSpace.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/ExplicitEuler.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/SpringDashpot.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "vtk/all.h"
+
+#include <functional>
+
+#include "InitializeDomainForPSM.h"
+#include "PSMPackInfo.h"
+#include "PSMSweep.h"
+#include "PSM_InfoHeader.h"
+#include "PSM_MacroGetter.h"
+#include "PSM_NoSlip.h"
+
+namespace settling_sphere
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+using namespace lbm_mesapd_coupling::psm::gpu;
+// flag type and type of the flag field that distinguishes fluid cells from boundary cells
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+// pack info type that is registered with the communication scheme of the PDF field
+using PackInfo_T = pystencils::PSMPackInfo;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag("Fluid");   // flag that is set for all non-boundary cells of the flag field
+const FlagUID NoSlip_Flag("NoSlip"); // flag that the "Boundaries" config block assigns to the six domain borders
+
+//*******************************************************************************************************************
+/*!\brief Evaluating the position and velocity of the sphere
+ *
+ */
+//*******************************************************************************************************************
+template< typename ParticleAccessor_T >
+class SpherePropertyLogger
+{
+ public:
+   /// Stores the logging setup; if \p fileIO is set, the root process (re)creates \p fileName and writes the
+   /// header line that names the columns appended by writeToFile().
+   SpherePropertyLogger(const shared_ptr< ParticleAccessor_T >& ac, walberla::id_t sphereUid,
+                        const std::string& fileName, bool fileIO, real_t dx_SI, real_t dt_SI, real_t diameter,
+                        real_t gravitationalForceMag)
+      : ac_(ac), sphereUid_(sphereUid), fileName_(fileName), fileIO_(fileIO), dx_SI_(dx_SI), dt_SI_(dt_SI),
+        diameter_(diameter), gravitationalForceMag_(gravitationalForceMag), position_(real_t(0)),
+        maxVelocity_(real_t(0))
+   {
+      if (fileIO_)
+      {
+         WALBERLA_ROOT_SECTION()
+         {
+            std::ofstream file(fileName_.c_str());
+            file << "#\t t\t posX\t posY\t gapZ\t velX\t velY\t velZ\t forceX\t forceY\t forceZ\n";
+         }
+      }
+   }
+
+   /// Gathers the sphere state on all processes, updates z-position and max. downward velocity, optionally logs.
+   void operator()(const uint_t timestep)
+   {
+      Vector3< real_t > pos(real_t(0));
+      Vector3< real_t > transVel(real_t(0));
+      Vector3< real_t > hydForce(real_t(0));
+
+      // only a process that holds the sphere as non-ghost particle fills in values, all others keep zeros
+      size_t idx = ac_->uidToIdx(sphereUid_);
+      if (idx != ac_->getInvalidIdx() &&
+          !mesa_pd::data::particle_flags::isSet(ac_->getFlags(idx), mesa_pd::data::particle_flags::GHOST))
+      {
+         pos      = ac_->getPosition(idx);
+         transVel = ac_->getLinearVelocity(idx);
+         hydForce = ac_->getHydrodynamicForce(idx);
+      }
+
+      // the all-reduce sums the contributions so that every process ends up with the same values
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace(pos[0], mpi::SUM);
+         mpi::allReduceInplace(pos[1], mpi::SUM);
+         mpi::allReduceInplace(pos[2], mpi::SUM);
+         mpi::allReduceInplace(transVel[0], mpi::SUM);
+         mpi::allReduceInplace(transVel[1], mpi::SUM);
+         mpi::allReduceInplace(transVel[2], mpi::SUM);
+         mpi::allReduceInplace(hydForce[0], mpi::SUM);
+         mpi::allReduceInplace(hydForce[1], mpi::SUM);
+         mpi::allReduceInplace(hydForce[2], mpi::SUM);
+      }
+
+      position_    = pos[2];
+      maxVelocity_ = std::max(maxVelocity_, -transVel[2]); // downward (negative z) motion counts as positive
+
+      if (fileIO_) writeToFile(timestep, pos, transVel, hydForce);
+   }
+
+   /// z-component of the sphere position obtained in the most recent call of operator().
+   real_t getPosition() const { return position_; }
+
+   /// Largest negated z-velocity seen over all calls of operator() so far (never below zero).
+   real_t getMaxVelocity() const { return maxVelocity_; }
+
+ private:
+   /// Appends one line to the log file on the root process: time step, time in SI units, position divided by
+   /// the diameter (z-entry reduced by 0.5), velocity in SI units, and force divided by gravitationalForceMag_.
+   void writeToFile(const uint_t timestep, const Vector3< real_t >& position, const Vector3< real_t >& velocity,
+                    const Vector3< real_t >& hydForce)
+   {
+      WALBERLA_ROOT_SECTION()
+      {
+         std::ofstream file(fileName_.c_str(), std::ofstream::app);
+         auto scaledPosition     = position / diameter_;
+         auto velocity_SI        = velocity * dx_SI_ / dt_SI_;
+         auto normalizedHydForce = hydForce / gravitationalForceMag_;
+
+         file << timestep << "\t" << real_c(timestep) * dt_SI_ << "\t" << scaledPosition[0] << "\t"
+              << scaledPosition[1] << "\t" << scaledPosition[2] - real_t(0.5) << "\t" << velocity_SI[0] << "\t"
+              << velocity_SI[1] << "\t" << velocity_SI[2] << "\t" << normalizedHydForce[0] << "\t"
+              << normalizedHydForce[1] << "\t" << normalizedHydForce[2] << "\n";
+      }
+   }
+
+   shared_ptr< ParticleAccessor_T > ac_;
+   const walberla::id_t sphereUid_;
+   std::string fileName_;
+   bool fileIO_;
+   real_t dx_SI_, dt_SI_, diameter_, gravitationalForceMag_;
+
+   real_t position_;
+   real_t maxVelocity_;
+};
+
+//*******************************************************************************************************************
+/*!\brief Creates the six bounding planes of the simulation domain as particles with half-space shape
+ *
+ * Every plane is a particle obtained from ps->create(true) that gets an infinite interaction radius, type 0, the
+ * rank of the calling process as owner, and the particle flags INFINITE and FIXED.
+ *
+ * The planes with normals +z, +x and +y are positioned at the min corner of the domain, the planes with normals
+ * -z, -x and -y at its max corner. They are created pairwise in the order z, x, y, starting with the min corner.
+ *
+ * \param ps                particle storage the plane particles are added to
+ * \param ss                shape storage the half-space shapes are added to
+ * \param simulationDomain  box whose min and max corners are used as plane positions
+ */
+//*******************************************************************************************************************
+void createPlaneSetup(const shared_ptr< mesa_pd::data::ParticleStorage >& ps,
+                      const shared_ptr< mesa_pd::data::ShapeStorage >& ss, const math::AABB& simulationDomain)
+{
+   // values shared by all six planes
+   const auto ownerRank   = mpi::MPIManager::instance()->rank();
+   const real_t unbounded = std::numeric_limits< real_t >::infinity();
+
+   // the two domain corners that serve as plane positions
+   const Vector3< real_t > lowerCorner = simulationDomain.minCorner();
+   const Vector3< real_t > upperCorner = simulationDomain.maxCorner();
+
+   // position and half-space normal of a single plane
+   struct PlaneSpec
+   {
+      Vector3< real_t > anchor;
+      Vector3< real_t > normal;
+   };
+
+   // one entry per plane; the order of the entries determines the order in which the particles and their
+   // shapes are created
+   const PlaneSpec planeSpecs[] = {
+      { lowerCorner, Vector3< real_t >(0, 0, 1) },
+      { upperCorner, Vector3< real_t >(0, 0, -1) },
+      { lowerCorner, Vector3< real_t >(1, 0, 0) },
+      { upperCorner, Vector3< real_t >(-1, 0, 0) },
+      { lowerCorner, Vector3< real_t >(0, 1, 0) },
+      { upperCorner, Vector3< real_t >(0, -1, 0) },
+   };
+
+   // create one particle and one half-space shape per table entry
+   for (const PlaneSpec& spec : planeSpecs)
+   {
+      mesa_pd::data::Particle plane = *ps->create(true);
+      plane.setPosition(spec.anchor);
+      plane.setInteractionRadius(unbounded);
+      plane.setShapeID(ss->create< mesa_pd::data::HalfSpace >(spec.normal));
+      plane.setOwner(ownerRank);
+      plane.setType(0);
+
+      // set the INFINITE and FIXED particle flags
+      mesa_pd::data::particle_flags::set(plane.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+      mesa_pd::data::particle_flags::set(plane.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+   }
+}
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Testcase that simulates the settling of a sphere inside a rectangular column filled with viscous fluid
+ *
+ * see: ten Cate, Nieuwstad, Derksen, Van den Akker - "Particle imaging velocimetry experiments and lattice-Boltzmann
+ * simulations on a single sphere settling under gravity" (2002), Physics of Fluids, doi: 10.1063/1.1512918
+ */
+//*******************************************************************************************************************
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+
+   mpi::Environment env(argc, argv);
+
+   logging::Logging::instance()->setLogLevel(logging::Logging::INFO);
+
+   ///////////////////
+   // Customization //
+   ///////////////////
+
+   // simulation control
+   bool shortrun          = false;
+   bool funcTest          = false;
+   bool fileIO            = false;
+   uint_t vtkIOFreq       = 0;
+   std::string baseFolder = "vtk_out_SettlingSphere_CPU_GPU";
+
+   // physical setup
+   uint_t fluidType = 1;
+
+   // numerical parameters
+   uint_t numberOfCellsInHorizontalDirection = uint_t(135);
+   bool averageForceTorqueOverTwoTimeSteps   = true;
+   uint_t numRPDSubCycles                    = uint_t(1);
+   bool useVelocityVerlet                    = false;
+
+   for (int i = 1; i < argc; ++i)
+   {
+      if (std::strcmp(argv[i], "--shortrun") == 0)
+      {
+         shortrun = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--funcTest") == 0)
+      {
+         funcTest = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--fileIO") == 0)
+      {
+         fileIO = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--vtkIOFreq") == 0)
+      {
+         vtkIOFreq = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--fluidType") == 0)
+      {
+         fluidType = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--numRPDSubCycles") == 0)
+      {
+         numRPDSubCycles = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--resolution") == 0)
+      {
+         numberOfCellsInHorizontalDirection = uint_c(std::atof(argv[++i]));
+         continue;
+      }
+      if (std::strcmp(argv[i], "--noForceAveraging") == 0)
+      {
+         averageForceTorqueOverTwoTimeSteps = false;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--baseFolder") == 0)
+      {
+         baseFolder = argv[++i];
+         continue;
+      }
+      if (std::strcmp(argv[i], "--useVV") == 0)
+      {
+         useVelocityVerlet = true;
+         continue;
+      }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   if (funcTest) { walberla::logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::WARNING); }
+
+   if (fileIO)
+   {
+      // create base directory if it does not yet exist
+      filesystem::path tpath(baseFolder);
+      if (!filesystem::exists(tpath)) filesystem::create_directory(tpath);
+   }
+
+   //////////////////////////////////////
+   // SIMULATION PROPERTIES in SI units//
+   //////////////////////////////////////
+
+   // values are mainly taken from the reference paper
+   const real_t diameter_SI      = real_t(15e-3);
+   const real_t densitySphere_SI = real_t(1120);
+
+   real_t densityFluid_SI, dynamicViscosityFluid_SI;
+   real_t expectedSettlingVelocity_SI;
+   switch (fluidType)
+   {
+   case 1:
+      // Re_p around 1.5
+      densityFluid_SI             = real_t(970);
+      dynamicViscosityFluid_SI    = real_t(373e-3);
+      expectedSettlingVelocity_SI = real_t(0.035986);
+      break;
+   case 2:
+      // Re_p around 4.1
+      densityFluid_SI             = real_t(965);
+      dynamicViscosityFluid_SI    = real_t(212e-3);
+      expectedSettlingVelocity_SI = real_t(0.05718);
+      break;
+   case 3:
+      // Re_p around 11.6
+      densityFluid_SI             = real_t(962);
+      dynamicViscosityFluid_SI    = real_t(113e-3);
+      expectedSettlingVelocity_SI = real_t(0.087269);
+      break;
+   case 4:
+      // Re_p around 31.9
+      densityFluid_SI             = real_t(960);
+      dynamicViscosityFluid_SI    = real_t(58e-3);
+      expectedSettlingVelocity_SI = real_t(0.12224);
+      break;
+   default:
+      WALBERLA_ABORT("Only four different fluids are supported! Choose type between 1 and 4.");
+   }
+   const real_t kinematicViscosityFluid_SI = dynamicViscosityFluid_SI / densityFluid_SI;
+
+   const real_t gravitationalAcceleration_SI = real_t(9.81);
+   Vector3< real_t > domainSize_SI(real_t(100e-3), real_t(100e-3), real_t(160e-3));
+   // shift starting gap a bit upwards to match the reported (plotted) values
+   const real_t startingGapSize_SI = real_t(120e-3) + real_t(0.25) * diameter_SI;
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setup (in SI units):");
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid type = " << fluidType);
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize_SI);
+   WALBERLA_LOG_INFO_ON_ROOT(" - sphere: diameter = " << diameter_SI << ", density = " << densitySphere_SI
+                                                      << ", starting gap size = " << startingGapSize_SI);
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: density = " << densityFluid_SI << ", dyn. visc = " << dynamicViscosityFluid_SI
+                                                    << ", kin. visc = " << kinematicViscosityFluid_SI);
+   WALBERLA_LOG_INFO_ON_ROOT(" - expected settling velocity = "
+                             << expectedSettlingVelocity_SI << " --> Re_p = "
+                             << expectedSettlingVelocity_SI * diameter_SI / kinematicViscosityFluid_SI);
+
+   //////////////////////////
+   // NUMERICAL PARAMETERS //
+   //////////////////////////
+
+   const real_t dx_SI = domainSize_SI[0] / real_c(numberOfCellsInHorizontalDirection);
+   const Vector3< uint_t > domainSize(uint_c(floor(domainSize_SI[0] / dx_SI + real_t(0.5))),
+                                      uint_c(floor(domainSize_SI[1] / dx_SI + real_t(0.5))),
+                                      uint_c(floor(domainSize_SI[2] / dx_SI + real_t(0.5))));
+   const real_t diameter     = diameter_SI / dx_SI;
+   const real_t sphereVolume = math::pi / real_t(6) * diameter * diameter * diameter;
+
+   const real_t expectedSettlingVelocity = real_t(0.01);
+   const real_t dt_SI                    = expectedSettlingVelocity / expectedSettlingVelocity_SI * dx_SI;
+
+   const real_t viscosity      = kinematicViscosityFluid_SI * dt_SI / (dx_SI * dx_SI);
+   const real_t relaxationTime = real_t(1) / lbm::collision_model::omegaFromViscosity(viscosity);
+
+   const real_t gravitationalAcceleration = gravitationalAcceleration_SI * dt_SI * dt_SI / dx_SI;
+
+   const real_t densityFluid  = real_t(1);
+   const real_t densitySphere = densityFluid * densitySphere_SI / densityFluid_SI;
+
+   const real_t dx = real_t(1);
+
+   const uint_t timesteps = funcTest ? 1 : (shortrun ? uint_t(200) : uint_t(250000));
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - dx_SI = " << dx_SI << ", dt_SI = " << dt_SI);
+   WALBERLA_LOG_INFO_ON_ROOT("Setup (in simulation, i.e. lattice, units):");
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
+   WALBERLA_LOG_INFO_ON_ROOT(" - sphere: diameter = " << diameter << ", density = " << densitySphere);
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: density = " << densityFluid << ", relaxation time (tau) = " << relaxationTime
+                                                    << ", kin. visc = " << viscosity);
+   WALBERLA_LOG_INFO_ON_ROOT(" - gravitational acceleration = " << gravitationalAcceleration);
+   WALBERLA_LOG_INFO_ON_ROOT(" - expected settling velocity = " << expectedSettlingVelocity << " --> Re_p = "
+                                                                << expectedSettlingVelocity * diameter / viscosity);
+   WALBERLA_LOG_INFO_ON_ROOT(" - integrator = " << (useVelocityVerlet ? "Velocity Verlet" : "Explicit Euler"));
+
+   if (vtkIOFreq > 0)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files to folder \"" << baseFolder << "\" with frequency " << vtkIOFreq);
+   }
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   Vector3< uint_t > numberOfBlocksPerDirection(uint_t(1), uint_t(1), uint_t(MPIManager::instance()->numProcesses()));
+   Vector3< uint_t > cellsPerBlockPerDirection(domainSize[0] / numberOfBlocksPerDirection[0],
+                                               domainSize[1] / numberOfBlocksPerDirection[1],
+                                               domainSize[2] / numberOfBlocksPerDirection[2]);
+   WALBERLA_CHECK_EQUAL(
+      numberOfBlocksPerDirection[0] * numberOfBlocksPerDirection[1] * numberOfBlocksPerDirection[2],
+      uint_t(MPIManager::instance()->numProcesses()),
+      "When using GPUs, the number of blocks ("
+         << numberOfBlocksPerDirection[0] * numberOfBlocksPerDirection[1] * numberOfBlocksPerDirection[2]
+         << ") has to match the number of MPI processes (" << uint_t(MPIManager::instance()->numProcesses()) << ")");
+
+   for (uint_t i = 0; i < 3; ++i)
+   {
+      WALBERLA_CHECK_EQUAL(cellsPerBlockPerDirection[i] * numberOfBlocksPerDirection[i], domainSize[i],
+                           "Unmatching domain decomposition in direction " << i << "!");
+   }
+
+   auto blocks = blockforest::createUniformBlockGrid(numberOfBlocksPerDirection[0], numberOfBlocksPerDirection[1],
+                                                     numberOfBlocksPerDirection[2], cellsPerBlockPerDirection[0],
+                                                     cellsPerBlockPerDirection[1], cellsPerBlockPerDirection[2], dx, 0,
+                                                     false, false, false, false, false, // periodicity
+                                                     false);
+
+   WALBERLA_LOG_INFO_ON_ROOT("Domain decomposition:");
+   WALBERLA_LOG_INFO_ON_ROOT(" - blocks per direction = " << numberOfBlocksPerDirection);
+   WALBERLA_LOG_INFO_ON_ROOT(" - cells per block = " << cellsPerBlockPerDirection);
+
+   // write domain decomposition to file
+   if (vtkIOFreq > 0) { vtk::writeDomainDecomposition(blocks, "initial_domain_decomposition", baseFolder); }
+
+   //////////////////
+   // RPD COUPLING //
+   //////////////////
+
+   auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+
+   // init data structures
+   auto ps                  = walberla::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = walberla::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+
+   // bounding planes
+   createPlaneSetup(ps, ss, blocks->getDomain());
+
+   // create sphere and store Uid
+   Vector3< real_t > initialPosition(real_t(0.5) * real_c(domainSize[0]), real_t(0.5) * real_c(domainSize[1]),
+                                     startingGapSize_SI / dx_SI + real_t(0.5) * diameter);
+   auto sphereShape = ss->create< mesa_pd::data::Sphere >(diameter * real_t(0.5));
+   ss->shapes[sphereShape]->updateMassAndInertia(densitySphere);
+
+   walberla::id_t sphereUid = 0;
+   if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), initialPosition))
+   {
+      mesa_pd::data::Particle&& p = *ps->create();
+      p.setPosition(initialPosition);
+      p.setInteractionRadius(diameter * real_t(0.5));
+      p.setOwner(mpi::MPIManager::instance()->rank());
+      p.setShapeID(sphereShape);
+      sphereUid = p.getUid();
+   }
+   mpi::allReduceInplace(sphereUid, mpi::SUM);
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // add PDF field
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID BFieldID         = field::addToStorage< BField_T >(blocks, "B field CPU", 0, field::fzyx, 1);
+   BlockDataID BsFieldID        = field::addToStorage< BsField_T >(blocks, "Bs field CPU", 0, field::fzyx, 1);
+   BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
+#else
+   BlockDataID pdfFieldCPUGPUID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
+#endif
+
+   BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "Density", real_t(0), field::fzyx);
+   BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "Velocity", real_t(0), field::fzyx);
+
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   // set up RPD functionality
+   std::function< void(void) > syncCall = [ps, rpdDomain]() {
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *rpdDomain);
+   };
+
+   syncCall();
+
+   mesa_pd::kernel::ExplicitEuler explEulerIntegrator(real_t(1) / real_t(numRPDSubCycles));
+   mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(real_t(1) / real_t(numRPDSubCycles));
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(real_t(1) / real_t(numRPDSubCycles));
+
+   mesa_pd::kernel::SpringDashpot collisionResponse(1);
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+
+   // set up coupling functionality
+   lbm_mesapd_coupling::RegularParticlesSelector sphereSelector;
+   Vector3< real_t > gravitationalForce(real_t(0), real_t(0),
+                                        -(densitySphere - densityFluid) * gravitationalAcceleration * sphereVolume);
+   lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+   lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(
+      viscosity, [](real_t r) { return real_t(0.0016) * r; });
+   lbm::PSM_NoSlip noSlip(blocks, pdfFieldCPUGPUID);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // map no-slip boundaries into the LBM simulation
+   std::string boundariesBlockString = " Boundaries"
+                                       "{"
+                                       "Border { direction T;    walldistance -1;  flag NoSlip; }"
+                                       "Border { direction B;    walldistance -1;  flag NoSlip; }"
+                                       "Border { direction N;    walldistance -1;  flag NoSlip; }"
+                                       "Border { direction S;    walldistance -1;  flag NoSlip; }"
+                                       "Border { direction W;    walldistance -1;  flag NoSlip; }"
+                                       "Border { direction E;    walldistance -1;  flag NoSlip; }"
+                                       "}";
+
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream boundariesFile("boundaries.prm");
+      boundariesFile << boundariesBlockString;
+      boundariesFile.close();
+   }
+   WALBERLA_MPI_BARRIER()
+
+   auto boundariesCfgFile = Config();
+   boundariesCfgFile.readParameterFile("boundaries.prm");
+   auto boundariesConfig = boundariesCfgFile.getBlock("Boundaries");
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, Fluid_Flag);
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, NoSlip_Flag, Fluid_Flag);
+
+   // add particle and volume fraction data structures
+   ParticleAndVolumeFractionSoA_T< 1 > particleAndVolumeFractionSoA(
+      blocks, lbm::collision_model::omegaFromViscosity(viscosity));
+   // map particles and calculate solid volume fraction initially
+   PSMSweepCollection psmSweepCollection(blocks, accessor, sphereSelector, particleAndVolumeFractionSoA, Vector3(27));
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      psmSweepCollection.particleMappingSweep(&(*blockIt));
+   }
+
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt));
+      pdfSetter(&(*blockIt));
+   }
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, 0, false);
+#else
+   walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
+#endif
+
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   // create the timeloop
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                 particleAndVolumeFractionSoA.particleForcesFieldID,
+                                 particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0.0),
+                                 real_t(0.0), real_t(0.0), lbm::collision_model::omegaFromViscosity(viscosity));
+
+   timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, real_t(0.0), real_t(0.0),
+                                           real_t(0.0));
+#else
+   pystencils::PSM_MacroGetter getterSweep(particleAndVolumeFractionSoA.BFieldID, densityFieldID, pdfFieldCPUGPUID,
+                                           velFieldID, real_t(0.0), real_t(0.0), real_t(0.0));
+#endif
+   // vtk output
+   if (vtkIOFreq != uint_t(0))
+   {
+      // spheres
+      auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
+      particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {
+         return pIt->getShapeID() == sphereShape;
+      });
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleOwner >("owner");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+      auto particleVtkWriter =
+         vtk::createVTKOutput_PointData(particleVtkOutput, "Particles", vtkIOFreq, baseFolder, "simulation_step");
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+
+      // flag field (written only once in the first time step, ghost layers are also written)
+      // auto flagFieldVTK = vtk::createVTKOutput_BlockData( blocks, "flag_field", timesteps, FieldGhostLayers, false,
+      // baseFolder ); flagFieldVTK->addCellDataWriter( make_shared< field::VTKWriter< FlagField_T > >( flagFieldID,
+      // "FlagField" ) ); timeloop.addFuncBeforeTimeStep( vtk::writeFiles( flagFieldVTK ), "VTK (flag field data)" );
+
+      // pdf field
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid_field", vtkIOFreq, 0, false, baseFolder);
+
+      pdfFieldVTK->addBeforeFunction(communication);
+
+      pdfFieldVTK->addBeforeFunction([&]() {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+         gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldCPUGPUID);
+         gpu::fieldCpy< BField_T, BFieldGPU_T >(blocks, BFieldID, particleAndVolumeFractionSoA.BFieldID);
+         gpu::fieldCpy< BsField_T, BsFieldGPU_T >(blocks, BsFieldID, particleAndVolumeFractionSoA.BsFieldID);
+#endif
+         for (auto& block : *blocks)
+            getterSweep(&block);
+      });
+
+      field::FlagFieldCellFilter< FlagField_T > fluidFilter(flagFieldID);
+      fluidFilter.addFlag(Fluid_Flag);
+      pdfFieldVTK->addCellInclusionFilter(fluidFilter);
+
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< DensityField_T > >(densityFieldID, "Density"));
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(BFieldID, "Fraction mapping field B"));
+      pdfFieldVTK->addCellDataWriter(
+         make_shared< field::VTKWriter< BsField_T > >(BsFieldID, "Fraction mapping field Bs"));
+#else
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(particleAndVolumeFractionSoA.BFieldID,
+                                                                                 "Fraction mapping field B"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BsField_T > >(
+         particleAndVolumeFractionSoA.BsFieldID, "Fraction mapping field Bs"));
+#endif
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip
+   // treatment)
+   timeloop.add() << BeforeFunction(communication, "LBM Communication")
+                  << Sweep(noSlip.getSweep(), "Boundary Handling");
+
+   // stream + collide LBM step
+   addPSMSweepsToTimeloop(timeloop, psmSweepCollection, PSMSweep);
+
+   // evaluation functionality
+   std::string loggingFileName(baseFolder + "/LoggingSettlingSphereGPU_");
+   loggingFileName += std::to_string(fluidType);
+   loggingFileName += ".txt";
+   if (fileIO) { WALBERLA_LOG_INFO_ON_ROOT(" - writing logging output to file \"" << loggingFileName << "\""); }
+   SpherePropertyLogger< ParticleAccessor_T > logger(accessor, sphereUid, loggingFileName, fileIO, dx_SI, dt_SI,
+                                                     diameter, -gravitationalForce[2]);
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+
+   real_t terminationPosition = real_t(0.51) * diameter; // right before sphere touches the bottom wall
+
+   const bool useOpenMP = false;
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i)
+   {
+      // perform a single simulation step -> this contains LBM and setting of the hydrodynamic interactions
+      timeloop.singleStep(timeloopTiming);
+
+      timeloopTiming["RPD"].start();
+
+      // -> full hydrodynamic force/torque info is available on local particle
+      reduceProperty.operator()< mesa_pd::HydrodynamicForceTorqueNotification >(*ps);
+
+      if (averageForceTorqueOverTwoTimeSteps)
+      {
+         if (i == 0)
+         {
+            lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel
+               initializeHydrodynamicForceTorqueForAveragingKernel;
+            ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                                initializeHydrodynamicForceTorqueForAveragingKernel, *accessor);
+         }
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, averageHydrodynamicForceTorque,
+                             *accessor);
+      }
+
+      for (auto subCycle = uint_t(0); subCycle < numRPDSubCycles; ++subCycle)
+      {
+         if (useVelocityVerlet)
+         {
+            ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+            syncCall();
+         }
+
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addHydrodynamicInteraction,
+                             *accessor);
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor);
+
+         // lubrication correction
+         ps->forEachParticlePairHalf(
+            useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+            [&lubricationCorrectionKernel, rpdDomain](const size_t idx1, const size_t idx2, auto& ac) {
+               // TODO change this to storing copy, not reference
+               mesa_pd::collision_detection::AnalyticContactDetection acd;
+               acd.getContactThreshold() = lubricationCorrectionKernel.getNormalCutOffDistance();
+               mesa_pd::kernel::DoubleCast double_cast;
+               mesa_pd::mpi::ContactFilter contact_filter;
+               if (double_cast(idx1, idx2, ac, acd, ac))
+               {
+                  if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                  {
+                     double_cast(idx1, idx2, ac, lubricationCorrectionKernel, ac, acd.getContactNormal(),
+                                 acd.getPenetrationDepth());
+                  }
+               }
+            },
+            *accessor);
+
+         // one could add linked cells here
+
+         // collision response
+         ps->forEachParticlePairHalf(
+            useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+            [collisionResponse, rpdDomain](const size_t idx1, const size_t idx2, auto& ac) {
+               mesa_pd::collision_detection::AnalyticContactDetection acd;
+               mesa_pd::kernel::DoubleCast double_cast;
+               mesa_pd::mpi::ContactFilter contact_filter;
+               if (double_cast(idx1, idx2, ac, acd, ac))
+               {
+                  if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                  {
+                     collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(),
+                                       acd.getPenetrationDepth());
+                  }
+               }
+            },
+            *accessor);
+
+         reduceProperty.operator()< mesa_pd::ForceTorqueNotification >(*ps);
+
+         if (useVelocityVerlet)
+            ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+         else
+            ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, explEulerIntegrator, *accessor);
+
+         syncCall();
+      }
+
+      timeloopTiming["RPD"].end();
+
+      // evaluation
+      timeloopTiming["Logging"].start();
+      logger(i);
+      timeloopTiming["Logging"].end();
+
+      // reset after logging here
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor);
+
+      // check for termination
+      if (logger.getPosition() < terminationPosition)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("Sphere reached terminal position " << logger.getPosition() << " after " << i
+                                                                       << " timesteps!");
+         break;
+      }
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   // check the result
+   if (!funcTest && !shortrun)
+   {
+      real_t relErr = std::fabs(expectedSettlingVelocity - logger.getMaxVelocity()) / expectedSettlingVelocity;
+      WALBERLA_LOG_INFO_ON_ROOT("Expected maximum settling velocity: " << expectedSettlingVelocity);
+      WALBERLA_LOG_INFO_ON_ROOT("Simulated maximum settling velocity: " << logger.getMaxVelocity());
+      WALBERLA_LOG_INFO_ON_ROOT("Relative error: " << relErr);
+
+      // the relative error has to be below 10%
+      WALBERLA_CHECK_LESS(relErr, real_t(0.1));
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace settling_sphere
+
+int main(int argc, char** argv) { return settling_sphere::main(argc, argv); } // propagate the test's exit status
diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/TorqueSpherePSM.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/TorqueSpherePSM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..194b3eb3a7dfbfc8f356aa9108c4b70694c1241a
--- /dev/null
+++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/codegen/TorqueSpherePSM.cpp
@@ -0,0 +1,480 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TorqueSpherePSM.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//! \brief Modification of pe_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Environment.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Reduce.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+
+#include <iostream>
+// codegen
+#include "InitializeDomainForPSM.h"
+#include "PSMPackInfo.h"
+#include "PSMSweep.h"
+#include "PSM_InfoHeader.h"
+
+namespace torque_sphere_psm
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+using namespace lbm_mesapd_coupling::psm::gpu;
+
+// type aliases (written as alias-declarations throughout)
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+using PackInfo_T  = pystencils::PSMPackInfo;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag("fluid");
+
+////////////////
+// PARAMETERS //
+////////////////
+
+struct Setup // simulation parameters shared between main() and TorqueEval
+{
+   uint_t checkFrequency; // the torque is evaluated only every checkFrequency time steps
+   real_t visc; // viscosity in lattice units
+   real_t tau; // relaxation time
+   real_t radius; // sphere radius
+   uint_t length; // length of the cubic domain in lattice cells
+   real_t phi; // solid volume fraction
+   real_t angularVel; // angular velocity of the sphere
+   real_t analyticalTorque; // (semi)analytical reference torque, set by the TorqueEval constructor
+};
+
+//*******************************************************************************************************************
+/*!\brief Functor evaluating the torque on a sphere, rotating with a constant angular velocity
+ *  Every checkFrequency-th time step the y-component of the hydrodynamic torque is summed over all processes. */
+//*******************************************************************************************************************
+template< typename ParticleAccessor_T >
+class TorqueEval
+{
+ public:
+   TorqueEval(SweepTimeloop* timeloop, Setup* setup, const shared_ptr< StructuredBlockStorage >& blocks,
+              const shared_ptr< ParticleAccessor_T >& ac, bool fileIO)
+      : timeloop_(timeloop), setup_(setup), blocks_(blocks), ac_(ac), fileIO_(fileIO), torqueOld_(0.0), torqueNew_(0.0)
+   {
+      // calculate the (semi)analytical torque value and store it in the setup
+      // see also Hofmann et al. - Hydrodynamic interactions in colloidal crystals:(II). Application to dense cubic and
+      // tetragonal arrays (1999), Eqs. 5.1 and 5.5
+      const real_t S = real_c(1.95708);
+      setup_->analyticalTorque =
+         -setup_->visc *
+         (real_c(6) * setup_->phi / (real_c(1) - setup_->phi - S * std::pow(setup_->phi, real_c(10. / 3.)))) *
+         setup_->angularVel * real_c(setup_->length * setup_->length * setup_->length);
+      // on request, (re)create the output file on the root process and write its header line
+      if (fileIO_)
+      {
+         std::ofstream file;
+         filename_ = "TorqueSpherePSMGPU.txt";
+         WALBERLA_ROOT_SECTION()
+         {
+            file.open(filename_.c_str());
+            file << "#\t torqueSim\t torqueAnaly\n";
+            file.close();
+         }
+      }
+   }
+
+   // evaluate the acting torque (only every checkFrequency-th time step; optionally appended to the output file)
+   void operator()()
+   {
+      const uint_t timestep(timeloop_->getCurrentTimeStep() + 1);
+
+      if (timestep % setup_->checkFrequency != 0) return;
+
+      // update torque values (the previous value is kept for the convergence check)
+      torqueOld_ = torqueNew_;
+      torqueNew_ = calculateTorque();
+
+      // write to file if desired
+      WALBERLA_ROOT_SECTION()
+      {
+         if (fileIO_)
+         {
+            std::ofstream file;
+            file.open(filename_.c_str(), std::ofstream::app);
+            file.setf(std::ios::unitbuf);
+            file.precision(15);
+            file << timestep << " " << torqueNew_ << " " << setup_->analyticalTorque << "\n";
+            file.close();
+         }
+      }
+   }
+
+   // obtain the torque acting on the sphere by summing up all the process local parts
+   real_t calculateTorque()
+   {
+      // The accessor is not tied to a block, so every particle is visited exactly once here. Looping over the
+      // process-local blocks around this loop would add each particle's torque once per block and thus scale
+      // the result by the number of local blocks.
+      real_t torque = real_c(0);
+      for (size_t idx = 0; idx < ac_->size(); ++idx)
+      {
+         torque += ac_->getHydrodynamicTorque(idx)[1]; // y-component only
+      }
+
+      WALBERLA_MPI_SECTION() { mpi::allReduceInplace(torque, mpi::SUM); }
+      return torque;
+   }
+
+   // return the relative temporal change in the torque (not finite while the latest torque value is zero)
+   real_t getTorqueDiff() const { return std::fabs((torqueNew_ - torqueOld_) / torqueNew_); }
+
+   // return the most recently evaluated torque
+   real_t getTorque() const { return torqueNew_; }
+
+ private:
+   SweepTimeloop* timeloop_;
+
+   Setup* setup_;
+
+   shared_ptr< StructuredBlockStorage > blocks_; // kept for interface compatibility, not needed for the evaluation
+   shared_ptr< ParticleAccessor_T > ac_;
+
+   bool fileIO_;
+   std::string filename_;
+
+   real_t torqueOld_;
+   real_t torqueNew_;
+};
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Testcase that checks the torque acting on a constantly rotating sphere in the center of a cubic domain
+ *
+ * The torque for this problem (often denoted as Simple Cubic setup) is given by a semi-analytical formula.
+ * The cubic domain is periodic in all directions, making it a physically infinite periodic array of spheres.
+   \verbatim
+         _______________
+        |       <-      |
+        |      ___      |
+        |     /   \     |
+        |    |  x  |    |
+        |     \___/     |
+        |      ->       |
+        |_______________|
+
+   \endverbatim
+ *
+ * The collision model used for the LBM is TRT with a relaxation parameter tau=1.5 and the magic parameter 3/16.
+ * The Stokes approximation of the equilibrium PDFs is used.
+ * The sphere rotates with an angular velocity of 1e-5 around the y-axis.
+ * The domain is 32x32x32, and the sphere has a diameter of around 27 cells ( chi * domainlength )
+ * The simulation is run until the relative change in the torque between 100 time steps is less than 1e-5 (or the
+ * maximum number of time steps is reached). No rigid body dynamics are executed since the angular velocity is kept
+ * constant and the hydrodynamic force/torque is explicitly reset after each time step.
+ * Command line flags: --funcTest (1 time step), --shortrun (150 time steps), --fileIO (torque log file and summary).
+ */
+//*******************************************************************************************************************
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+
+   mpi::Environment env(argc, argv);
+
+   logging::Logging::instance()->setLogLevel(logging::Logging::INFO);
+
+   auto processes = MPIManager::instance()->numProcesses();
+
+   if (processes != 1 && processes != 2 && processes != 4 && processes != 8) // block setup below: one block per process
+   {
+      std::cerr << "Number of processes must be equal to either 1, 2, 4, or 8!" << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   ///////////////////
+   // Customization //
+   ///////////////////
+
+   bool shortrun = false;
+   bool funcTest = false;
+   bool fileIO   = false;
+
+   for (int i = 1; i < argc; ++i)
+   {
+      if (std::strcmp(argv[i], "--shortrun") == 0)
+      {
+         shortrun = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--funcTest") == 0)
+      {
+         funcTest = true;
+         continue;
+      }
+      if (std::strcmp(argv[i], "--fileIO") == 0)
+      {
+         fileIO = true;
+         continue;
+      }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   ///////////////////////////
+   // SIMULATION PROPERTIES //
+   ///////////////////////////
+
+   Setup setup;
+
+   setup.length         = uint_t(32);   // length of the cubic domain in lattice cells
+   const real_t chi     = real_c(0.85); // porosity parameter: diameter / length
+   setup.tau            = real_c(1.5);  // relaxation time
+   setup.angularVel     = real_c(1e-5); // angular velocity of the sphere
+   setup.checkFrequency = uint_t(100);  // evaluate the torque only every checkFrequency time steps
+   setup.radius         = real_c(0.5) * chi * real_c(setup.length); // sphere radius
+   setup.visc           = (setup.tau - real_c(0.5)) / real_c(3);    // viscosity in lattice units
+   setup.phi            = real_c(4.0 / 3.0) * math::pi * setup.radius * setup.radius * setup.radius /
+               (real_c(setup.length * setup.length * setup.length)); // solid volume fraction
+   const real_t omega            = real_c(1) / setup.tau;            // relaxation rate
+   const real_t dx               = real_c(1);                        // lattice dx
+   const real_t convergenceLimit = real_c(1e-5);                     // tolerance for relative change in torque
+   const uint_t timesteps =
+      funcTest ? 1 : (shortrun ? uint_c(150) : uint_c(5000)); // maximum number of time steps for the whole simulation
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const uint_t XBlocks = (processes >= 2) ? uint_t(2) : uint_t(1);
+   const uint_t YBlocks = (processes >= 4) ? uint_t(2) : uint_t(1);
+   const uint_t ZBlocks = (processes == 8) ? uint_t(2) : uint_t(1);
+   const uint_t XCells  = setup.length / XBlocks;
+   const uint_t YCells  = setup.length / YBlocks;
+   const uint_t ZCells  = setup.length / ZBlocks;
+
+   // create fully periodic domain (the number of blocks equals the number of processes)
+   auto blocks = blockforest::createUniformBlockGrid(XBlocks, YBlocks, ZBlocks, XCells, YCells, ZCells, dx, true, true,
+                                                     true, true);
+
+   ////////
+   // PE //
+   ////////
+
+   auto mesapdDomain        = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+   auto ps                  = std::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = std::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+
+   /////////////////
+   // PE COUPLING //
+   /////////////////
+
+   // required clearance between the sphere surface and the boundary of the periodic domain
+   const real_t overlap = real_c(1.5) * dx;
+
+   if (setup.radius > real_c(setup.length) * real_c(0.5) - overlap)
+   {
+      std::cerr << "Periodic sphere is too large!" << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   // create the sphere in the middle of the domain (only on the process whose subdomain contains its center)
+   Vector3< real_t > position(real_c(setup.length) * real_c(0.5));
+   auto sphereShape = ss->create< mesa_pd::data::Sphere >(setup.radius);
+
+   if (mesapdDomain->isContainedInProcessSubdomain(uint_c(walberla::mpi::MPIManager::instance()->rank()), position))
+   {
+      auto sphereParticle = ps->create();
+      sphereParticle->setShapeID(sphereShape);
+      sphereParticle->setType(0);
+      sphereParticle->setPosition(position);
+      sphereParticle->setAngularVelocity(Vector3(real_c(0), setup.angularVel, real_c(0)));
+      sphereParticle->setOwner(walberla::MPIManager::instance()->rank());
+      sphereParticle->setInteractionRadius(setup.radius);
+   }
+
+   // synchronize often enough for large particles
+   std::function< void(void) > syncCall = [&]() {
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *mesapdDomain);
+   };
+
+   syncCall();
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // add fields ( uInit = <0,0,0>, rhoInit = 1 )
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU", true);
+#else
+   BlockDataID pdfFieldCPUGPUID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
+#endif
+
+   // add particle and volume fraction data structures
+   ParticleAndVolumeFractionSoA_T< Weighting > particleAndVolumeFractionSoA(blocks, omega);
+   // map particles and calculate solid volume fraction initially
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
+                                         particleAndVolumeFractionSoA, Vector3(8));
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      psmSweepCollection.particleMappingSweep(&(*blockIt));
+   }
+
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt));
+      pdfSetter(&(*blockIt));
+   }
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, 0, false);
+#else
+   walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
+#endif
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   using TorqueEval_T                    = TorqueEval< ParticleAccessor_T >;
+   shared_ptr< TorqueEval_T > torqueEval = make_shared< TorqueEval_T >(&timeloop, &setup, blocks, accessor, fileIO);
+
+   pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                 particleAndVolumeFractionSoA.particleForcesFieldID,
+                                 particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0.0),
+                                 real_t(0.0), real_t(0.0), omega);
+
+   // communication, streaming and force evaluation (the torque evaluation runs after the last sweep of a time step)
+   timeloop.add() << BeforeFunction(communication, "LBM Communication")
+                  << Sweep(deviceSyncWrapper(psmSweepCollection.setParticleVelocitiesSweep),
+                           "setParticleVelocitiesSweep");
+   timeloop.add() << Sweep(deviceSyncWrapper(PSMSweep), "cell-wise LB sweep");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmSweepCollection.reduceParticleForcesSweep), "Reduce particle forces");
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   timeloop.add() << Sweep(gpu::fieldCpyFunctor< PdfField_T, gpu::GPUField< real_t > >(pdfFieldID, pdfFieldCPUGPUID),
+                           "copy pdf from GPU to CPU")
+#else
+   struct emptySweep // no-op sweep for the CPU build, only carries the torque evaluation as its after-function
+   {
+      void operator()(IBlock*) {}
+   };
+   timeloop.add() << Sweep(emptySweep(), "emptySweep")
+#endif
+                  << AfterFunction(SharedFunctor< TorqueEval_T >(torqueEval), "torque evaluation");
+
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+
+   timeloop.addFuncAfterTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i)
+   {
+      // perform a single simulation step
+      timeloop.singleStep(timeloopTiming);
+
+      // resetting force (the torque of this time step has already been evaluated inside singleStep)
+      ps->forEachParticle(false, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor);
+
+      // check if the relative change in the torque is below the specified convergence criterion
+      if (i > setup.checkFrequency && torqueEval->getTorqueDiff() < convergenceLimit)
+      {
+         // if simulation has converged, terminate simulation
+         break;
+      }
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   // check the result (skipped for the shortened runs, which are not expected to reach the converged torque)
+   if (!funcTest && !shortrun)
+   {
+      real_t relErr = std::fabs((setup.analyticalTorque - torqueEval->getTorque()) / setup.analyticalTorque);
+      if (fileIO)
+      {
+         WALBERLA_ROOT_SECTION()
+         {
+            std::cout << "Analytical torque: " << setup.analyticalTorque << "\n"
+                      << "Simulated torque: " << torqueEval->getTorque() << "\n"
+                      << "Relative error: " << relErr << "\n";
+         }
+      }
+      // the relative error has to be below 10% (25% for SC2)
+      WALBERLA_CHECK_LESS(relErr, (SC == 2) ? real_c(0.25) : real_c(0.1));
+   }
+
+   return 0;
+}
+
+} // namespace torque_sphere_psm
+
+int main(int argc, char** argv) { return torque_sphere_psm::main(argc, argv); } // propagate the test's exit status
diff --git a/tests/mesa_pd/CMakeLists.txt b/tests/mesa_pd/CMakeLists.txt
index 423dda6ef8a2811cd5225d8e34ed12752283c07a..7b217a13677f9c56c79a81cccd0ff007e24d99dd 100644
--- a/tests/mesa_pd/CMakeLists.txt
+++ b/tests/mesa_pd/CMakeLists.txt
@@ -4,6 +4,8 @@
 #
 ###################################################################################################
 
+waLBerla_link_files_to_builddir(*.prm)
+
 waLBerla_compile_test( NAME   MESA_PD_COLLISIONDETECTION_AnalyticCollisionFunctions FILES collision_detection/AnalyticCollisionFunctions.cpp DEPENDS core )
 waLBerla_execute_test( NAME   MESA_PD_COLLISIONDETECTION_AnalyticCollisionFunctions )
 
@@ -224,6 +226,8 @@ waLBerla_execute_test( NAME   MESA_PD_MPI_VelocityCorrectionNotification PROCESS
 waLBerla_compile_test( NAME   MESA_PD_Sorting FILES Sorting.cpp DEPENDS core )
 waLBerla_execute_test( NAME   MESA_PD_Sorting )
 
+waLBerla_compile_test( NAME   MESA_PD_Stiffness FILES Stiffness.cpp DEPENDS blockforest core mesa_pd )
+
 waLBerla_compile_test( NAME   MESA_PD_VTK_Outputs FILES vtk/VTKOutputs.cpp DEPENDS blockforest core vtk )
 waLBerla_execute_test( NAME   MESA_PD_VTK_Outputs PROCESSES 8 )
 
diff --git a/tests/mesa_pd/Stiffness.cpp b/tests/mesa_pd/Stiffness.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ebf03ba6d74079043f4b70b0d0e4312b52fc52f
--- /dev/null
+++ b/tests/mesa_pd/Stiffness.cpp
@@ -0,0 +1,142 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   Stiffness.cpp
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/ParticleAccessorWithBaseShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+
+namespace walberla
+{
+
+using namespace mesa_pd;
+
+int main(int argc, char** argv)
+{
+   Environment env(argc, argv);
+   walberla::mpi::MPIManager::instance()->useWorldComm();
+   // Test: two initially touching spheres are pushed against each other by a linearly ramped force; the final
+   // overlap is compared to force / stiffness. Both spheres live on the calling process -> serial runs only.
+   WALBERLA_CHECK(MPIManager::instance()->numProcesses() == 1)
+
+   // all parameters are taken from the "Stiffness" block of the parameter file
+   auto parameterFile = env.config();
+   if (!parameterFile) WALBERLA_ABORT("No config specified!");
+   WALBERLA_LOG_INFO_ON_ROOT(*parameterFile);
+   const Config::BlockHandle stiffnessBlock = parameterFile->getBlock("Stiffness");
+
+   const Vec3 domainSize_SI             = stiffnessBlock.getParameter< Vec3 >("domainSize_SI");
+   const real_t diameter_SI             = stiffnessBlock.getParameter< real_t >("diameter_SI");
+   const real_t densityParticle_SI      = stiffnessBlock.getParameter< real_t >("densityParticle_SI");
+   const real_t dt_SI                   = stiffnessBlock.getParameter< real_t >("dt_SI");
+   const uint_t timeSteps               = stiffnessBlock.getParameter< uint_t >("timeSteps");
+   const real_t force_SI                = stiffnessBlock.getParameter< real_t >("force_SI");
+   const real_t normalSpringConstant_SI = stiffnessBlock.getParameter< real_t >("normalSpringConstant_SI");
+
+   // a single, non-periodic block spanning the whole simulation domain
+   const math::AABB simulationDomain_SI(real_t(0.0), real_t(0.0), real_t(0.0), domainSize_SI[0], domainSize_SI[1],
+                                        domainSize_SI[2]);
+
+   shared_ptr< BlockForest > blockForest =
+      blockforest::createBlockForest(simulationDomain_SI, Vec3(uint(1)), Vector3< bool >(false));
+   auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blockForest);
+
+   // particle storage and the accessor used by all kernels
+   auto particles = std::make_shared< data::ParticleStorage >(1);
+   data::ParticleAccessorWithBaseShape particleAccessor(particles);
+
+   // sphere 0: shifted by half a diameter towards -x from the domain center
+   auto sphere0 = particles->create();
+   sphere0->setPosition(simulationDomain_SI.center() - Vec3(diameter_SI / 2, real_t(0), real_t(0)));
+   sphere0->setInteractionRadius(diameter_SI * real_t(0.5));
+   sphere0->getBaseShapeRef() = std::make_shared< data::Sphere >(sphere0->getInteractionRadius());
+   sphere0->getBaseShapeRef()->updateMassAndInertia(densityParticle_SI);
+   sphere0->setOwner(walberla::mpi::MPIManager::instance()->rank());
+   sphere0->setType(0);
+   const auto idxSphere0 = sphere0->getIdx();
+
+   // sphere 1: shifted by half a diameter towards +x, i.e. the two surfaces initially just touch
+   auto sphere1 = particles->create();
+   sphere1->setPosition(simulationDomain_SI.center() + Vec3(diameter_SI / 2, real_t(0), real_t(0)));
+   sphere1->setInteractionRadius(diameter_SI * real_t(0.5));
+   sphere1->getBaseShapeRef() = std::make_shared< data::Sphere >(sphere1->getInteractionRadius());
+   sphere1->getBaseShapeRef()->updateMassAndInertia(densityParticle_SI);
+   sphere1->setOwner(walberla::mpi::MPIManager::instance()->rank());
+   sphere1->setType(0);
+   const auto idxSphere1 = sphere1->getIdx();
+
+   auto overlap = diameter_SI - (particleAccessor.getPosition(idxSphere1)[0] - particleAccessor.getPosition(idxSphere0)[0]);
+
+   // velocity Verlet integration and linear spring-dashpot contact model with the configured normal stiffness
+   mesa_pd::kernel::VelocityVerletPreForceUpdate preForceUpdate(dt_SI);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate postForceUpdate(dt_SI);
+   kernel::LinearSpringDashpot springDashpot(1);
+   springDashpot.setStiffnessN(0, 0, normalSpringConstant_SI);
+
+   for (uint_t step = 0; step < timeSteps; ++step)
+   {
+      particles->forEachParticle(false, kernel::SelectLocal(), particleAccessor, preForceUpdate, particleAccessor);
+
+      const real_t rampedForce = force_SI * real_t(step) / real_t(timeSteps); // grows linearly with the step index
+      sphere0->setForce(Vec3(rampedForce, real_t(0), real_t(0)));
+      sphere1->setForce(Vec3(-rampedForce, real_t(0), real_t(0)));
+
+      particles->forEachParticlePairHalf(
+         false, kernel::ExcludeInfiniteInfinite(), particleAccessor,
+         [rpdDomain, &springDashpot, dt_SI](const size_t first, const size_t second, auto& ac) {
+            collision_detection::AnalyticContactDetection contact;
+            kernel::DoubleCast doubleCast;
+            mesa_pd::mpi::ContactFilter contactFilter;
+            // the contact model is applied only to detected contacts that also pass the contact filter
+            if (doubleCast(first, second, ac, contact, ac) &&
+                contactFilter(contact.getIdx1(), contact.getIdx2(), ac, contact.getContactPoint(), *rpdDomain))
+            {
+               springDashpot(contact.getIdx1(), contact.getIdx2(), ac, contact.getContactPoint(),
+                             contact.getContactNormal(), contact.getPenetrationDepth(), dt_SI);
+            }
+         },
+         particleAccessor);
+      // the overlap is sampled before the post-force update of this step
+      overlap = diameter_SI - (particleAccessor.getPosition(idxSphere1)[0] - particleAccessor.getPosition(idxSphere0)[0]);
+
+      particles->forEachParticle(false, kernel::SelectLocal(), particleAccessor, postForceUpdate, particleAccessor);
+   }
+
+   WALBERLA_LOG_DEVEL_VAR(overlap)
+   const real_t expectedOverlap = force_SI / normalSpringConstant_SI;
+   WALBERLA_LOG_DEVEL_VAR(expectedOverlap)
+   WALBERLA_CHECK_FLOAT_EQUAL(overlap, expectedOverlap)
+
+   return EXIT_SUCCESS;
+}
+} // namespace walberla
+
+int main(int argc, char** argv) { const int status = walberla::main(argc, argv); return status; } // forward exit code
diff --git a/tests/mesa_pd/Stiffness.prm b/tests/mesa_pd/Stiffness.prm
new file mode 100644
index 0000000000000000000000000000000000000000..e943ae84e8b6395b9f2eb3040131def2d4884368
--- /dev/null
+++ b/tests/mesa_pd/Stiffness.prm
@@ -0,0 +1,9 @@
+Stiffness{
+    domainSize_SI < 1, 1, 1 >;
+    diameter_SI 0.01;
+    densityParticle_SI 2500;
+    dt_SI 5e-5;
+    timeSteps 1000000;
+    force_SI 1;
+    normalSpringConstant_SI 1000;
+}