diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index cbdd59565c1b497600a7bb815d60be6b0b4533e9..10176106b6d3bbe2ad5f9d64aedc25d09deb19f5 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -30,6 +30,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
       add_subdirectory( PhaseFieldAllenCahn )
       add_subdirectory( NonUniformGridCPU )
       add_subdirectory( PSM_Moving_Geometry )
+      add_subdirectory( PSM_Benchmark_Test )
    endif()
 
    if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT )
diff --git a/apps/benchmarks/PSM_Benchmark_Test/CMakeLists.txt b/apps/benchmarks/PSM_Benchmark_Test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dd15b401af7a1b3384261c0d1658e6a319fafd62
--- /dev/null
+++ b/apps/benchmarks/PSM_Benchmark_Test/CMakeLists.txt
@@ -0,0 +1,26 @@
+waLBerla_link_files_to_builddir( *.prm )
+waLBerla_link_files_to_builddir( *.png )
+waLBerla_link_files_to_builddir( *.obj )
+waLBerla_link_files_to_builddir( *.stl )
+
+
+walberla_generate_target_from_python( NAME PSM_Benchmark_TestCodeGen
+        FILE PSM_Benchmark_Test.py
+        OUT_FILES PSM_Benchmark_TestStorageSpecification.h PSM_Benchmark_TestStorageSpecification.${CODEGEN_FILE_SUFFIX}
+        PSM_Benchmark_TestSweepCollection.h PSM_Benchmark_TestSweepCollection.${CODEGEN_FILE_SUFFIX}
+        PSM_Benchmark_TestBoundaryCollection.h
+        NoSlip.${CODEGEN_FILE_SUFFIX} NoSlip.h
+        PSM_Benchmark_Test_InfoHeader.h
+        PSM_Conditional_Sweep.${CODEGEN_FILE_SUFFIX} PSM_Conditional_Sweep.h
+        )
+
+
+if(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+    walberla_add_executable ( NAME PSM_Benchmark_Test
+            FILES PSM_Benchmark_Test.cpp ../PSM_Complex_Geometry/MovingGeometry.h ../PSM_Complex_Geometry/MovingGeometry.cu
+            DEPENDS blockforest gpu core domain_decomposition field geometry mesh mesh_common timeloop lbm stencil vtk PSM_Benchmark_TestCodeGen )
+else()
+    walberla_add_executable ( NAME PSM_Benchmark_Test
+            FILES PSM_Benchmark_Test.cpp ../PSM_Complex_Geometry/MovingGeometry.h
+            DEPENDS blockforest core domain_decomposition field geometry mesh mesh_common timeloop lbm stencil vtk PSM_Benchmark_TestCodeGen )
+endif()
\ No newline at end of file
diff --git a/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.cpp b/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..489961fb99f49117e9a1861eb4fc86a5de8e530e
--- /dev/null
+++ b/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.cpp
@@ -0,0 +1,275 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PSM_Benchmark_Test.cpp
+//! \author Philipp Suffa <philipp.suffa@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "core/all.h"
+#include "domain_decomposition/all.h"
+#include "field/all.h"
+#include "field/vtk/VTKWriter.h"
+#include "geometry/all.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
+#include "stencil/D3Q19.h"
+#include "timeloop/all.h"
+#include <random>
+
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#   include "gpu/communication/UniformGPUScheme.h"
+#   include "lbm_generated/gpu/AddToStorage.h"
+#   include "lbm_generated/gpu/GPUPdfField.h"
+#   include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+#endif
+#include "../PSM_Complex_Geometry/MovingGeometry.h"
+#include "PSM_Benchmark_Test_InfoHeader.h"
+
+namespace walberla
+{
+///////////////////////
+/// Typedef Aliases ///
+///////////////////////
+
+using StorageSpecification_T = lbm::PSM_Benchmark_TestStorageSpecification;
+using Stencil_T = lbm::PSM_Benchmark_TestStorageSpecification::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+using gpu::communication::UniformGPUScheme;
+#endif
+
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+using BoundaryCollection_T = lbm::PSM_Benchmark_TestBoundaryCollection< FlagField_T >;
+using SweepCollection_T = lbm::PSM_Benchmark_TestSweepCollection;
+const FlagUID fluidFlagUID("Fluid");
+using blockforest::communication::UniformBufferedScheme;
+
+/////////////////////
+/// Main Function ///
+/////////////////////
+
+auto deviceSyncWrapper = [](std::function< void(IBlock*) > sweep) {
+   return [sweep](IBlock* b) {
+      sweep(b);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+      cudaDeviceSynchronize();
+#endif
+   };
+};
+
+
+
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::selectDeviceBasedOnMpiRank();
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+
+   logging::Logging::instance()->setLogLevel( logging::Logging::INFO );
+
+   mpi::MPIManager::instance()->useWorldComm();
+
+   ///////////////////////
+   /// PARAMETER INPUT ///
+   ///////////////////////
+
+   const uint_t timesteps = 1001;
+   const Vector3< real_t > initialVelocity =  Vector3< real_t >(0.0);
+   const real_t omega = 1.4;
+
+   const real_t remainingTimeLoggerFrequency = real_c(5.0);
+   const uint_t VTKWriteFrequency = 0;
+
+
+   ////////////////////
+   /// PROCESS MESH ///
+   ////////////////////
+
+   auto numBlocks = Vector3<uint_t> (1,1,1);
+   auto cellsPerBlock = Vector3<uint_t>(128, 128, 128);
+   auto domainAABB = AABB(0,0,0,1,1,1);
+
+   auto blocks = walberla::blockforest::createUniformBlockGrid(domainAABB, numBlocks[0], numBlocks[1], numBlocks[2], cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2], true);
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   /////////////////////////
+   /// Boundary Handling ///
+   /////////////////////////
+
+
+   const BlockDataID fractionFieldId = field::addToStorage< FracField_T >(blocks, "fractionField", real_t(0.0), field::fzyx, uint_c(1));
+   const BlockDataID objectVelocitiesFieldId = field::addToStorage< VectorField_T >(blocks, "particleVelocitiesField", real_c(0.0), field::fzyx, uint_c(1));
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   const BlockDataID fractionFieldGPUId = gpu::addGPUFieldToStorage< FracField_T >(blocks, fractionFieldId, "fractionFieldGPU", true);
+   const BlockDataID objectVelocitiesFieldGPUId = gpu::addGPUFieldToStorage< VectorField_T >(blocks, objectVelocitiesFieldId, "object velocity field on GPU", true);
+#endif
+
+
+
+   /////////////////////////
+   /// Fields Creation   ///
+   /////////////////////////
+
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   const BlockDataID pdfFieldId  = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx);
+   const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_t(0.0), field::fzyx, uint_c(1));
+   const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(1));
+   const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flagField");
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   const BlockDataID pdfFieldGPUId = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldId, StorageSpec, "pdf field on GPU", true);
+   const BlockDataID velocityFieldGPUId = gpu::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity field on GPU", true);
+   const BlockDataID densityFieldGPUId = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldId, "density field on GPU", true);
+#endif
+
+
+   /////////////
+   /// Sweep ///
+   /////////////
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   SweepCollection_T sweepCollection(blocks, fractionFieldGPUId, objectVelocitiesFieldGPUId, pdfFieldGPUId, densityFieldGPUId, velocityFieldGPUId, omega);
+   pystencils::PSM_Conditional_Sweep psmConditionalSweep( fractionFieldGPUId, objectVelocitiesFieldGPUId, pdfFieldGPUId, omega );
+#else
+   SweepCollection_T sweepCollection(blocks, fractionFieldId, objectVelocitiesFieldId, pdfFieldId, densityFieldId, velocityFieldId, omega);
+   pystencils::PSM_Conditional_Sweep psmConditionalSweep( fractionFieldId, objectVelocitiesFieldId, pdfFieldId, omega );
+#endif
+
+   for (auto& block : *blocks)
+   {
+      auto velField = block.getData<VectorField_T>(velocityFieldId);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, velField->get(x,y,z,0) = initialVelocity[0];)
+
+      auto fractionField = block.getData<FracField_T>(fractionFieldId);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(fractionField, fractionField->get(x,y,z,0) = ((double) rand() / (RAND_MAX));)
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+      gpu::GPUField<real_t> * dstVel = block.getData<gpu::GPUField<real_t>>( velocityFieldGPUId );
+      const VectorField_T * srcVel = block.getData<VectorField_T>( velocityFieldId );
+      gpu::fieldCpy( *dstVel, *srcVel );
+
+      gpu::GPUField<real_t> * dstFrac = block.getData<gpu::GPUField<real_t>>( fractionFieldGPUId );
+      const FracField_T * srcFrac = block.getData<FracField_T>( fractionFieldId );
+      gpu::fieldCpy( *dstFrac, *srcFrac );
+#endif
+      sweepCollection.initialise(&block, 1);
+   }
+
+   /////////////////////
+   /// Communication ///
+   ////////////////////
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   const bool sendDirectlyFromGPU = false;
+   UniformGPUScheme< Stencil_T > communication(blocks, sendDirectlyFromGPU);
+   auto packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUPdfField_T >>(pdfFieldGPUId);
+   communication.addPackInfo(packInfo);
+#else
+   UniformBufferedScheme< Stencil_T > communication(blocks);
+   auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId);
+   communication.addPackInfo(packInfo);
+#endif
+
+   /////////////////
+   /// Time Loop ///
+   /////////////////
+
+   const auto emptySweep = [](IBlock*) {};
+
+   //timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "Communication")
+   //               << Sweep(deviceSyncWrapper(boundaryCollection.getSweep(BoundaryCollection_T::ALL)), "Boundary Conditions");
+   timeloop.add() << Sweep(deviceSyncWrapper(psmConditionalSweep), "PSMConditionalSweep");
+
+   timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger");
+
+
+   /////////////////
+   /// VTK       ///
+   /////////////////
+
+   if (VTKWriteFrequency > 0)
+   {
+      const std::string path = "vtk_out";
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "fields", VTKWriteFrequency, 0, false, path, "simulation_step", false, true, true, false, 0);
+
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+            sweepCollection.calculateMacroscopicParameters(&block);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+         gpu::fieldCpy< VectorField_T, gpu::GPUField< real_t > >(blocks, velocityFieldId, velocityFieldGPUId);
+         gpu::fieldCpy< FracField_T, gpu::GPUField< real_t > >(blocks, fractionFieldId, fractionFieldGPUId);
+         gpu::fieldCpy< VectorField_T, gpu::GPUField< real_t > >(blocks, objectVelocitiesFieldId, objectVelocitiesFieldGPUId);
+
+#endif
+      });
+
+      auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity");
+      auto fractionFieldWriter = make_shared< field::VTKWriter< FracField_T > >(fractionFieldId, "FractionField");
+      auto objVeldWriter = make_shared< field::VTKWriter< VectorField_T > >(objectVelocitiesFieldId, "objectVelocity");
+
+      vtkOutput->addCellDataWriter(velWriter);
+      vtkOutput->addCellDataWriter(fractionFieldWriter);
+      vtkOutput->addCellDataWriter(objVeldWriter);
+
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+      vtk::writeDomainDecomposition(blocks, "domain_decomposition", "vtk_out", "write_call", true, true, 0);
+   }
+
+
+   for (auto& block : *blocks)
+   {
+      FlagField_T *flagField = block.getData<FlagField_T>(flagFieldId);
+      flagField->registerFlag(fluidFlagUID);
+   }
+
+   /////////////////
+   /// TIMING    ///
+   /////////////////
+
+   lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldId, fluidFlagUID);
+   WcTimingPool timeloopTiming;
+   WcTimer simTimer;
+
+   simTimer.start();
+   timeloop.run(timeloopTiming, true);
+   simTimer.end();
+
+   double time = simTimer.max();
+   WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+   performance.logResultOnRoot(timesteps, time);
+
+   const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+   WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
diff --git a/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.py b/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.py
new file mode 100644
index 0000000000000000000000000000000000000000..136d29a2b5b96992c8a73f1f5aeb38537fc4d7e7
--- /dev/null
+++ b/apps/benchmarks/PSM_Benchmark_Test/PSM_Benchmark_Test.py
@@ -0,0 +1,109 @@
+import copy
+import sympy as sp
+from sympy.core.add import Add
+from sympy.core.mul import Mul
+from sympy.codegen.ast import Assignment
+from dataclasses import replace
+
+import pystencils as ps
+
+from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel
+from lbmpy.partially_saturated_cells import PSMConfig
+
+from lbmpy.creationfunctions import create_lb_update_rule, create_lb_method, create_lb_collision_rule, create_psm_update_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter, macroscopic_values_getter
+from lbmpy.boundaries import NoSlip, UBB, FixedDensity, SimpleExtrapolationOutflow
+
+from pystencils.node_collection import NodeCollection
+from pystencils.astnodes import Conditional, SympyAssignment, Block
+from pystencils import TypedSymbol
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+from pystencils.cache import clear_cache
+clear_cache()
+
+#   =====================
+#      Code Generation
+#   =====================
+
+
+with CodeGeneration() as ctx:
+    #   ========================
+    #      General Parameters
+    #   ========================
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = LBStencil(Stencil.D3Q19)
+    omega = sp.Symbol('omega')
+    layout = 'fzyx'
+    if ctx.optimize_for_localhost:
+        cpu_vec = {"nontemporal": False, "assume_aligned": True}
+    else:
+        cpu_vec = None
+
+    openmp = True if ctx.openmp else False
+
+    #   PDF Fields
+    pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[3D]', layout=layout)
+
+    #   Velocity Output Field
+    velocity = ps.fields(f"velocity({stencil.D}): {data_type}[3D]", layout=layout)
+    density = ps.fields(f"density({1}): {data_type}[3D]", layout=layout)
+    macroscopic_fields = {'density': density, 'velocity': velocity}
+
+    fraction_field = ps.fields(f"frac_field({1}): {data_type}[3D]", layout=layout,)
+    object_velocity_field = ps.fields(f"obj_vel({stencil.D}): {data_type}[3D]", layout=layout,)
+
+    psm_config = PSMConfig(
+        fraction_field=fraction_field,
+        object_velocity_field=object_velocity_field,
+        SC=1,
+    )
+
+    lbm_config = LBMConfig(
+        stencil=stencil,
+        method=Method.SRT,
+        relaxation_rate=omega,
+        compressible=False,
+        zero_centered=True,
+        psm_config=psm_config,
+    )
+
+    lbm_config_no_psm = replace(lbm_config, psm_config=None)
+
+    lbm_opt = LBMOptimisation(cse_global=True,
+                              symbolic_field=pdfs,
+                              symbolic_temporary_field=pdfs_tmp,
+                              field_layout=layout,
+                              simplification=True)
+
+
+
+    target = ps.Target.GPU if ctx.gpu else ps.Target.CPU
+
+    collision_rule = create_lb_collision_rule(
+        lbm_config=lbm_config,
+        lbm_optimisation=lbm_opt
+    )
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', boundary_object=NoSlip())
+
+    generate_lbm_package(ctx, name="PSM_Benchmark_Test",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt, boundaries=[no_slip],
+                         nonuniform=False, macroscopic_fields=macroscopic_fields,
+                         cpu_openmp=openmp, cpu_vectorize_info=cpu_vec, target=target)
+
+
+    field_typedefs = {'VectorField_T': velocity,
+                      'ScalarField_T': density}
+
+    psm_condition_update_rule = create_psm_update_rule(lbm_config, lbm_opt)
+
+    generate_sweep(ctx, "PSM_Conditional_Sweep", psm_condition_update_rule, field_swaps=[(pdfs, pdfs_tmp)],
+                   cpu_openmp=openmp, cpu_vectorize_info=cpu_vec, target=target)
+
+    generate_info_header(ctx, 'PSM_Benchmark_Test_InfoHeader',
+                         field_typedefs=field_typedefs)
+
+