diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a860fe95fc189f9fc8d7c415e93ddb3d350a597..187dd2c3185c625c066c0bf283e4875893547246 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -51,6 +51,7 @@ stages: -DWALBERLA_BUILD_WITH_METIS=$WALBERLA_BUILD_WITH_METIS -DWALBERLA_BUILD_WITH_PARMETIS=$WALBERLA_BUILD_WITH_PARMETIS -DWALBERLA_BUILD_WITH_FFTW=$WALBERLA_BUILD_WITH_FFTW + -DWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT=$WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT -DWALBERLA_ENABLE_GUI=$WALBERLA_ENABLE_GUI -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN -DWALBERLA_STL_BOUNDS_CHECKS=$WALBERLA_STL_BOUNDS_CHECKS @@ -96,7 +97,7 @@ stages: icc_2022_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -112,7 +113,7 @@ icc_2022_serial: icc_2022_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" @@ -126,7 +127,7 @@ icc_2022_mpionly: icc_2022_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WARNING_ERROR: "OFF" @@ -136,7 +137,7 @@ icc_2022_hybrid: icc_2022_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -150,7 +151,7 @@ icc_2022_serial_dbg: icc_2022_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ 
-162,7 +163,7 @@ icc_2022_mpionly_dbg: icc_2022_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -173,7 +174,7 @@ icc_2022_hybrid_dbg: icc_2022_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -185,9 +186,9 @@ icc_2022_hybrid_dbg_sp: - cuda11 - docker -icx_2022_serial: +icx_2023_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -200,9 +201,9 @@ icx_2022_serial: - cuda11 - docker -icx_2022_mpionly: +icx_2023_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" @@ -213,18 +214,18 @@ icx_2022_mpionly: - cuda11 - docker -icx_2022_hybrid: +icx_2023_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" tags: - cuda11 - docker -icx_2022_serial_dbg: +icx_2023_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -235,9 +236,9 @@ icx_2022_serial_dbg: - cuda11 - docker -icx_2022_mpionly_dbg: +icx_2023_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: 
i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -246,9 +247,9 @@ icx_2022_mpionly_dbg: - cuda11 - docker -icx_2022_hybrid_dbg: +icx_2023_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -256,9 +257,9 @@ icx_2022_hybrid_dbg: - cuda11 - docker -icx_2022_hybrid_dbg_sp: +icx_2023_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2022 + image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2023:22 variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -269,16 +270,20 @@ icx_2022_hybrid_dbg_sp: - cuda11 - docker -gcc_9_serial: +gcc_10_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -286,6 +291,7 @@ gcc_9_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -293,21 +299,26 @@ gcc_9_serial: - cuda11 - docker -gcc_9_mpionly: +gcc_10_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -315,20 +326,25 @@ gcc_9_mpionly: - cuda11 - docker -gcc_9_hybrid: +gcc_10_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -336,16 +352,20 @@ gcc_9_hybrid: - cuda11 - docker -gcc_9_serial_dbg: +gcc_10_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -354,6 +374,7 @@ gcc_9_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -361,22 +382,27 @@ gcc_9_serial_dbg: - cuda11 - docker -gcc_9_mpionly_dbg: +gcc_10_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -384,35 +410,44 @@ gcc_9_mpionly_dbg: - cuda11 - docker -gcc_9_hybrid_dbg: +gcc_10_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -gcc_9_hybrid_dbg_sp: +gcc_10_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -421,6 +456,7 @@ gcc_9_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -428,16 +464,20 @@ gcc_9_hybrid_dbg_sp: - cuda11 - docker -gcc_10_serial: +gcc_11_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -445,6 +485,7 @@ gcc_10_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -452,21 +493,26 @@ gcc_10_serial: - cuda11 - docker -gcc_10_mpionly: +gcc_11_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -474,20 +520,25 @@ gcc_10_mpionly: - cuda11 - docker -gcc_10_hybrid: +gcc_11_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -495,16 +546,20 @@ gcc_10_hybrid: - cuda11 - docker -gcc_10_serial_dbg: +gcc_11_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -513,6 +568,7 @@ gcc_10_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -520,22 +576,27 @@ gcc_10_serial_dbg: - cuda11 - docker -gcc_10_mpionly_dbg: +gcc_11_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -543,21 +604,26 @@ gcc_10_mpionly_dbg: - cuda11 - docker -gcc_10_hybrid_dbg: +gcc_11_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -565,16 +631,20 @@ gcc_10_hybrid_dbg: - cuda11 - docker -gcc_10_hybrid_dbg_sp: +gcc_11_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -583,6 +653,7 @@ gcc_10_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -590,16 +661,20 @@ gcc_10_hybrid_dbg_sp: - cuda11 - docker -gcc_11_serial: +gcc_12_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -607,6 +682,7 @@ gcc_11_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -614,21 +690,26 @@ gcc_11_serial: - cuda11 - docker -gcc_11_mpionly: +gcc_12_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -636,20 +717,25 @@ gcc_11_mpionly: - cuda11 - docker -gcc_11_hybrid: +gcc_12_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -657,16 +743,20 @@ gcc_11_hybrid: - cuda11 - docker -gcc_11_serial_dbg: +gcc_12_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -675,6 +765,7 @@ gcc_11_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -682,22 +773,27 @@ gcc_11_serial_dbg: - cuda11 - docker -gcc_11_mpionly_dbg: +gcc_12_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -705,21 +801,26 @@ gcc_11_mpionly_dbg: - cuda11 - docker -gcc_11_hybrid_dbg: +gcc_12_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -727,16 +828,20 @@ gcc_11_hybrid_dbg: - cuda11 - docker -gcc_11_hybrid_dbg_sp: +gcc_12_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -745,6 +850,7 @@ gcc_11_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -752,14 +858,29 @@ gcc_11_hybrid_dbg_sp: - cuda11 - docker -gcc_12_serial: +gcc_13_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_PARMETIS: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -767,12 +888,27 @@ gcc_12_serial: - cuda11 - docker -gcc_12_mpionly: +gcc_13_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -780,73 +916,152 @@ gcc_12_mpionly: - cuda11 - docker -gcc_12_hybrid: +gcc_13_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 stage: pretest + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" tags: - cuda11 - docker -gcc_12_serial_dbg: +gcc_13_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_PARMETIS: "OFF" CMAKE_BUILD_TYPE: "DebugOptimized" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" tags: - cuda11 - docker -gcc_12_mpionly_dbg: +gcc_13_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" tags: - cuda11 - docker -gcc_12_hybrid_dbg: +gcc_13_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" tags: - cuda11 - docker -gcc_12_hybrid_dbg_sp: +gcc_13_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_DOUBLE_ACCURACY: "OFF" WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_METIS: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON" tags: - cuda11 - docker -clang_12_serial: +clang_14_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -854,6 +1069,7 @@ clang_12_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -861,21 +1077,26 @@ clang_12_serial: - cuda11 - docker -clang_12_mpionly: +clang_14_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -883,20 +1104,25 @@ clang_12_mpionly: - cuda11 - docker -clang_12_hybrid: +clang_14_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -904,16 +1130,20 @@ clang_12_hybrid: - cuda11 - docker -clang_12_serial_dbg: +clang_14_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -922,6 +1152,7 @@ clang_12_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -929,22 +1160,27 @@ clang_12_serial_dbg: - cuda11 - docker -clang_12_mpionly_dbg: +clang_14_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -952,35 +1188,44 @@ clang_12_mpionly_dbg: - cuda11 - docker -clang_12_hybrid_dbg: +clang_14_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -clang_12_hybrid_dbg_sp: +clang_14_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -989,6 +1234,7 @@ clang_12_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -996,16 +1242,20 @@ clang_12_hybrid_dbg_sp: - cuda11 - docker -clang_13_serial: +clang_15_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1013,6 +1263,7 @@ clang_13_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1020,21 +1271,26 @@ clang_13_serial: - cuda11 - docker -clang_13_mpionly: +clang_15_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1042,20 +1298,25 @@ clang_13_mpionly: - cuda11 - docker -clang_13_hybrid: +clang_15_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1063,16 +1324,20 @@ clang_13_hybrid: - cuda11 - docker -clang_13_serial_dbg: +clang_15_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1081,6 +1346,7 @@ clang_13_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1088,22 +1354,27 @@ clang_13_serial_dbg: - cuda11 - docker -clang_13_mpionly_dbg: +clang_15_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1111,21 +1382,26 @@ clang_13_mpionly_dbg: - cuda11 - docker -clang_13_hybrid_dbg: +clang_15_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1133,16 +1409,20 @@ clang_13_hybrid_dbg: - cuda11 - docker -clang_13_hybrid_dbg_sp: +clang_15_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -1151,6 +1431,7 @@ clang_13_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1158,16 +1439,20 @@ clang_13_hybrid_dbg_sp: - cuda11 - docker -clang_14_serial: +clang_16_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1175,6 +1460,7 @@ clang_14_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1182,21 +1468,26 @@ clang_14_serial: - cuda11 - docker -clang_14_mpionly: +clang_16_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1204,20 +1495,25 @@ clang_14_mpionly: - cuda11 - docker -clang_14_hybrid: +clang_16_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1225,16 +1521,20 @@ clang_14_hybrid: - cuda11 - docker -clang_14_serial_dbg: +clang_16_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1243,6 +1543,7 @@ clang_14_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1250,22 +1551,27 @@ clang_14_serial_dbg: - cuda11 - docker -clang_14_mpionly_dbg: +clang_16_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1273,21 +1579,26 @@ clang_14_mpionly_dbg: - cuda11 - docker -clang_14_hybrid_dbg: +clang_16_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1295,16 +1606,20 @@ clang_14_hybrid_dbg: - cuda11 - docker -clang_14_hybrid_dbg_sp: +clang_16_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -1313,6 +1628,7 @@ clang_14_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1320,16 +1636,20 @@ clang_14_hybrid_dbg_sp: - cuda11 - docker -clang_15_serial: +clang_17_serial: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1337,6 +1657,7 @@ clang_15_serial: WALBERLA_BUILD_WITH_PARMETIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1344,21 +1665,26 @@ clang_15_serial: - cuda11 - docker -clang_15_mpionly: +clang_17_mpionly: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" only: variables: - $ENABLE_NIGHTLY_BUILDS @@ -1366,34 +1692,43 @@ clang_15_mpionly: - cuda11 - docker -clang_15_hybrid: +clang_17_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -clang_15_serial_dbg: +clang_17_serial_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" WALBERLA_BUILD_WITH_MPI: "OFF" @@ -1402,60 +1737,75 @@ clang_15_serial_dbg: CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -clang_15_mpionly_dbg: +clang_17_mpionly_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_OPENMP: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -clang_15_hybrid_dbg: +clang_17_hybrid_dbg: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. - - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker -clang_15_hybrid_dbg_sp: +clang_17_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:22 stage: pretest before_script: - - pip3 install lbmpy==1.3.3 jinja2 pytest + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - - pip3 list + - python3 -m pip list + - deactivate + - python3 -m pip install numpy - cd .. 
- - CC=gcc CXX=g++ pip3 install cupy-cuda11x + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" @@ -1464,57 +1814,194 @@ clang_15_hybrid_dbg_sp: WALBERLA_BUILD_WITH_METIS: "OFF" WALBERLA_BUILD_WITH_CODEGEN: "ON" WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: - cuda11 - docker +aocc_4_serial: + extends: .build_template + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x + variables: + WALBERLA_BUILD_WITH_CUDA: "ON" + WALBERLA_BUILD_WITH_MPI: "OFF" + WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_PARMETIS: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + only: + variables: + - $ENABLE_NIGHTLY_BUILDS + tags: + - cuda11 + - docker +aocc_4_mpionly: + extends: .build_template + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x + variables: + WALBERLA_BUILD_WITH_CUDA: "ON" + WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + only: + variables: + - $ENABLE_NIGHTLY_BUILDS + tags: + - cuda11 + - docker -gcc_8_hybrid_dbg_noboost: +aocc_4_hybrid: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 before_script: - - rm -rf /opt/boost /usr/include/boost + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: - CMAKE_BUILD_TYPE: "DebugOptimized" - WALBERLA_BUILD_WITH_CUDA: "OFF" - WALBERLA_ENABLE_GUI: "OFF" - WALBERLA_BUILD_WITH_PYTHON: "OFF" + WALBERLA_BUILD_WITH_CUDA: "ON" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: + - cuda11 - docker +aocc_4_serial_dbg: + extends: .build_template + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x + variables: + WALBERLA_BUILD_WITH_CUDA: "ON" + WALBERLA_BUILD_WITH_MPI: "OFF" + WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_PARMETIS: "OFF" + CMAKE_BUILD_TYPE: "DebugOptimized" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + tags: + - cuda11 + - docker +aocc_4_mpionly_dbg: + extends: .build_template + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x + variables: + WALBERLA_BUILD_WITH_CUDA: "ON" + CMAKE_BUILD_TYPE: "DebugOptimized" + WALBERLA_BUILD_WITH_OPENMP: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + tags: + - cuda11 + - docker -############################################################################### -## ## -## STL Debug Build ## -## ## -############################################################################### +aocc_4_hybrid_dbg: + extends: .build_template + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 + before_script: + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. 
+ - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x + variables: + WALBERLA_BUILD_WITH_CUDA: "ON" + CMAKE_BUILD_TYPE: "DebugOptimized" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" + tags: + - cuda11 + - docker -gcc_10_stl_debug: +aocc_4_hybrid_dbg_sp: extends: .build_template - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-debug-stl:10 + image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:22 before_script: - - rm -rf /opt/boost /usr/include/boost + - python3 -m venv ci-venv + - source ci-venv/bin/activate + - python3 -m pip install lbmpy==1.3.4 jinja2 pytest + - cd python + - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla + - python3 -m pip list + - deactivate + - python3 -m pip install numpy + - cd .. + - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x variables: - CTEST_EXCLUDE_LABELS: "longrun" - WALBERLA_BUILD_WITH_MPI: "ON" - WALBERLA_BUILD_WITH_OPENMP: "ON" - OMP_NUM_THREADS: "4" - OMP_WAIT_POLICY: "PASSIVE" + WALBERLA_BUILD_WITH_CUDA: "ON" CMAKE_BUILD_TYPE: "DebugOptimized" - WALBERLA_BUFFER_DEBUG: "OFF" - WALBERLA_DOUBLE_ACCURACY: "ON" - WALBERLA_BUILD_WITH_CUDA: "OFF" - WALBERLA_BUILD_WITH_METIS: "OFF" + WALBERLA_DOUBLE_ACCURACY: "OFF" WALBERLA_BUILD_WITH_PARMETIS: "OFF" - WALBERLA_BUILD_WITH_PYTHON: "OFF" - WALBERLA_STL_BOUNDS_CHECKS: "ON" + WALBERLA_BUILD_WITH_METIS: "OFF" + WALBERLA_BUILD_WITH_CODEGEN: "ON" + WALBERLA_BUILD_WITH_PYTHON: "ON" + Python_ROOT_DIR: "./ci-venv" tags: + - cuda11 - docker - ############################################################################### ## ## ## Documentation ## @@ -1522,7 +2009,7 @@ gcc_10_stl_debug: ############################################################################### doc: - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13 script: - cmake --version - doxygen --version @@ -1582,7 +2069,7 @@ cppcheck: 
coverage: - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13 script: - pip3 install gcovr - export NUM_CORES=$(nproc --all) @@ -1654,8 +2141,8 @@ coverage: mac_Serial_Dbg: extends: .mac_build_template before_script: - - pip3 install pystencils==1.3.3 - - pip3 install lbmpy==1.3.3 + - pip3 install pystencils==1.3.4 + - pip3 install lbmpy==1.3.4 variables: CMAKE_BUILD_TYPE: "DebugOptimized" CTEST_EXCLUDE_LABELS: "longrun" @@ -1667,8 +2154,8 @@ mac_Serial_Dbg: mac_Serial: extends: .mac_build_template before_script: - - pip3 install pystencils==1.3.3 - - pip3 install lbmpy==1.3.3 + - pip3 install pystencils==1.3.4 + - pip3 install lbmpy==1.3.4 variables: CMAKE_BUILD_TYPE: "Release" CTEST_EXCLUDE_LABELS: "longrun" @@ -1680,8 +2167,8 @@ mac_Serial: mac_MpiOnly_Dbg: extends: .mac_build_template before_script: - - pip3 install pystencils==1.3.3 - - pip3 install lbmpy==1.3.3 + - pip3 install pystencils==1.3.4 + - pip3 install lbmpy==1.3.4 variables: CMAKE_BUILD_TYPE: "DebugOptimized" CTEST_EXCLUDE_LABELS: "longrun" @@ -1694,8 +2181,8 @@ mac_MpiOnly_Dbg: mac_MpiOnly: extends: .mac_build_template before_script: - - pip3 install pystencils==1.3.3 - - pip3 install lbmpy==1.3.3 + - pip3 install pystencils==1.3.4 + - pip3 install lbmpy==1.3.4 variables: CMAKE_BUILD_TYPE: "Release" CTEST_EXCLUDE_LABELS: "longrun" @@ -1808,23 +2295,18 @@ conda-py36-linux: benchmark_intel19: <<: *benchmark_definition - image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19 + image: i10git.cs.fau.de:5005/walberla/buildenvs/intel-2022 benchmark_gcc8: <<: *benchmark_definition - image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 + image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13 benchmark_clang8: <<: *benchmark_definition - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:8.0 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17 benchmark_ClangBuildAnalyzer: script: - - apt-get update --fix-missing - - apt-get -y install 
apt-transport-https ca-certificates gnupg software-properties-common wget - - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - - - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' - - apt-get -y install cmake ninja-build - cmake --version - ccache --version - mpirun --version @@ -1845,7 +2327,7 @@ benchmark_ClangBuildAnalyzer: - ninja all - ClangBuildAnalyzer --stop . CBA - ClangBuildAnalyzer --analyze CBA - image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0 + image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17 tags: - docker-benchmark only: @@ -1871,5 +2353,6 @@ continuous_benchmark_trigger: rules: - if: '$CI_PROJECT_PATH == "walberla/walberla" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH' when: on_success - - when: manual + - if: $CI_PIPELINE_SOURCE != "merge_request_event" + when: manual allow_failure: true \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 4365195f6a3318d1fbca8764220ec4d40348c7d0..f157d599e12dd1c282cf302ee4864cff662da495 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ option ( WALBERLA_PROFILE_USE "Uses Profile to optimize" option ( WALBERLA_OPTIMIZE_FOR_LOCALHOST "Enable compiler optimizations spcific to localhost" ) option ( WALBERLA_LOG_SKIPPED "Log skipped cmake targets" ON ) +option ( WALBERLA_DEPS_ERROR "Fail if module dependencies are not met" OFF ) option ( WALBERLA_GIT_SUBMODULE_AUTO "Check submodules during cmake run" ON ) @@ -215,7 +216,7 @@ endif() mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_IBM ) # Check for NEC SX compiler -if( CMAKE_CXX_COMPILER MATCHES "sxc" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "sxc" OR CMAKE_CXX_COMPILER MATCHES "sxmpic" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "sxmpic" ) +if( CMAKE_CXX_COMPILER MATCHES "/sxc" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "/sxc" OR CMAKE_CXX_COMPILER MATCHES "/sxmpic" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "/sxmpic" ) option ( WALBERLA_CXX_COMPILER_IS_NEC "Use NEC compiler" ON ) 
else() option ( WALBERLA_CXX_COMPILER_IS_NEC "Use NEC compiler" OFF ) @@ -927,7 +928,7 @@ if( (NOT DEFINED WALBERLA_BUILD_WITH_OPENMESH) OR WALBERLA_BUILD_WITH_OPENMESH ) add_definitions(-D_USE_MATH_DEFINES ) endif() else() - message(" If OpenMesh required, set OPENMESH_DIR to the OpenMesh directory.") + message(" If OpenMesh required, set OPENMESH_LIBRARY_DIR to the OpenMesh directory.") set( WALBERLA_BUILD_WITH_OPENMESH OFF CACHE BOOL "Build with OpenMesh support" FORCE ) endif() endif() @@ -1276,12 +1277,6 @@ if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) message( FATAL_ERROR "Compiler: ${CMAKE_CXX_COMPILER} Version: ${CMAKE_CXX_COMPILER_VERSION} does not support half precision" ) endif () - # Local host optimization - if ( NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST ) - message( WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " - "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!" 
) - endif () # Local host check - endif () # Check if WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is set ############################################################################################################################ diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 8abdee2a5b60b9f5c249f92351ea43220ea3309d..dec830b7e286202976ab3fc702ec1ae1a884260a 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -28,6 +28,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) add_subdirectory( UniformGridCPU ) add_subdirectory( PhaseFieldAllenCahn ) add_subdirectory( NonUniformGridCPU ) + add_subdirectory( TurbulentChannel ) endif() if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT ) diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp index 6476d82ec1877b7f2732837d89cf76a4d81b2627..34fde13e360e3190c3ec811b0f810432f47adf00 100644 --- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp @@ -132,7 +132,7 @@ int main(int argc, char** argv) SweepCollection_T sweepCollection(blocks, pdfFieldID, densityFieldID, velFieldID, omega, innerOuterSplit); for (auto& block : *blocks) { - sweepCollection.initialise(&block, 2); + sweepCollection.initialise(&block, cell_idx_c(1)); } WALBERLA_MPI_BARRIER() WALBERLA_LOG_INFO_ON_ROOT("Initialisation done") @@ -171,11 +171,13 @@ int main(int argc, char** argv) // VTK const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false); + const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", - "simulation_step", false, true, true, false, 0); - auto 
velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "vel"); + "simulation_step", false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel"); vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py index 8c99c62c2fc7c3791239b417b813fa6a1d62f1cd..51a0220b02b9316053f1e2f175d8e2d1aaf017b2 100644 --- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py @@ -6,6 +6,7 @@ import sys DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3") + class Scenario: def __init__(self, domain_size=(64, 64, 64), @@ -57,13 +58,15 @@ class Scenario: 'timesteps': self.timesteps, 'remainingTimeLoggerFrequency': self.logger_frequency, 'vtkWriteFrequency': self.vtk_write_frequency, + 'useVTKAMRWriter': True, + 'oneFilePerProcess': False }, 'Logging': { 'logLevel': "info", } } - if(print_dict): + if (print_dict): wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) return config_dict @@ -117,6 +120,7 @@ def validation_run(): write_setup_vtk=True) scenarios.add(scenario) + def scaling(): wlb.log_info_on_root("Running scaling benchmark...") @@ -134,5 +138,6 @@ def scaling(): timesteps=10) scenarios.add(scenario) + validation_run() # scaling() diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp index 919755d6d7cd481dd90a2f1310965e0c6947432c..233103342cdde4e21ab443cd3aadbece48b42c85 100644 --- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -190,7 +190,7 @@ int main(int argc, char** argv) SweepCollection_T sweepCollection(blocks, 
pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit); for (auto& iBlock : *blocks) { - sweepCollection.initialise(&iBlock, 2, nullptr); + sweepCollection.initialise(&iBlock, cell_idx_c(1), nullptr); } WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) WALBERLA_GPU_CHECK(gpuPeekAtLastError()) @@ -243,7 +243,7 @@ int main(int argc, char** argv) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0); - auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { diff --git a/apps/benchmarks/TurbulentChannel/CMakeLists.txt b/apps/benchmarks/TurbulentChannel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..73aa9ef56addc086f49cb4b533923a00ebce1018 --- /dev/null +++ b/apps/benchmarks/TurbulentChannel/CMakeLists.txt @@ -0,0 +1,26 @@ +waLBerla_link_files_to_builddir( *.prm ) + +if( WALBERLA_BUILD_WITH_CODEGEN ) + + # Turbulent Channel generation + walberla_generate_target_from_python( NAME TurbulentChannel_CodeGeneration + FILE TurbulentChannel.py + OUT_FILES CodegenIncludes.h + TurbulentChannel_Sweep.cpp TurbulentChannel_Sweep.h + TurbulentChannel_PackInfo.cpp TurbulentChannel_PackInfo.h + TurbulentChannel_Setter.cpp TurbulentChannel_Setter.h + TurbulentChannel_NoSlip.cpp TurbulentChannel_NoSlip.h + TurbulentChannel_FreeSlip_top.cpp TurbulentChannel_FreeSlip_top.h + TurbulentChannel_WFB_bottom.cpp TurbulentChannel_WFB_bottom.h + TurbulentChannel_WFB_top.cpp TurbulentChannel_WFB_top.h + TurbulentChannel_Welford.cpp TurbulentChannel_Welford.h + TurbulentChannel_Welford_TKE_SGS.cpp TurbulentChannel_Welford_TKE_SGS.h + TurbulentChannel_TKE_SGS_Writer.cpp 
TurbulentChannel_TKE_SGS_Writer.h + ) + + walberla_add_executable ( NAME TurbulentChannel_Application + FILES TurbulentChannel.cpp + DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk + TurbulentChannel_CodeGeneration ) + +endif() \ No newline at end of file diff --git a/apps/benchmarks/TurbulentChannel/TurbulentChannel.cpp b/apps/benchmarks/TurbulentChannel/TurbulentChannel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bd31d1263bc735ccdbd8d096c22057491f4d545b --- /dev/null +++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.cpp @@ -0,0 +1,931 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file TurbulentChannel.cpp +//! 
\author Helen Schottenhamml <helen.schottenhamml@fau.de> +// +//====================================================================================================================== + +#include <memory> +#include <cmath> +#include <string> +#include <iostream> + +#include <blockforest/all.h> +#include <core/all.h> +#include <domain_decomposition/all.h> +#include <field/all.h> +#include <field/vtk/VTKWriter.h> +#include <geometry/all.h> +#include <timeloop/all.h> +#include <lbm/all.h> + +// Codegen Includes +#include "CodegenIncludes.h" + +namespace walberla { + /////////////////////// + /// Typedef Aliases /// + /////////////////////// + + using Stencil_T = codegen::Stencil_T; + + // Communication Pack Info + using PackInfo_T = pystencils::TurbulentChannel_PackInfo; + + // PDF field type + using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >; + + // Field Types + using ScalarField_T = field::GhostLayerField< real_t, 1 >; + using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >; + using TensorField_T = field::GhostLayerField< real_t, Stencil_T::D*Stencil_T::D >; + + using Setter_T = pystencils::TurbulentChannel_Setter; + + using StreamCollideSweep_T = pystencils::TurbulentChannel_Sweep; + using WelfordSweep_T = pystencils::TurbulentChannel_Welford; + using TKEWelfordSweep_T = pystencils::TurbulentChannel_Welford_TKE_SGS; + + using TkeSgsWriter_T = pystencils::TurbulentChannel_TKE_SGS_Writer; + + // Boundary Handling + using flag_t = uint8_t; + using FlagField_T = FlagField< flag_t >; + using NoSlip_T = lbm::TurbulentChannel_NoSlip; + using FreeSlip_top_T = lbm::TurbulentChannel_FreeSlip_top; + using WFB_bottom_T = lbm::TurbulentChannel_WFB_bottom; + using WFB_top_T = lbm::TurbulentChannel_WFB_top; + + /// DEAN CORRELATIONS + + namespace dean_correlation { + + real_t calculateFrictionReynoldsNumber(const real_t reynoldsBulk) { + return std::pow(0.073_r / 8_r, 1_r / 2_r) * std::pow(reynoldsBulk, 7_r / 8_r); + } + + real_t 
calculateBulkReynoldsNumber(const real_t reynoldsFriction) { + return std::pow(8_r / 0.073_r, 4_r / 7_r) * std::pow(reynoldsFriction, 8_r / 7_r); + } + } // namespace dean_correlation + + + /// VELOCITY FIELD INITIALISATION + + /* + * Initialises the velocity field with a logarithmic profile and sinusoidal perturbations to trigger turbulence. + * This initialisation is provided by Henrik Asmuth. + */ + template<typename VelocityField_T> + void setVelocityFieldsAsmuth( const std::weak_ptr<StructuredBlockStorage>& forest, + const BlockDataID & velocityFieldId, const BlockDataID & meanVelocityFieldId, + const real_t frictionVelocity, const uint_t channel_half_width, + const real_t B, const real_t kappa, const real_t viscosity, + const uint_t wallAxis, const uint_t flowAxis ) { + + auto blocks = forest.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + const auto domainSize = blocks->getDomain().max(); + const auto delta = real_c(channel_half_width); + const auto remAxis = 3 - wallAxis - flowAxis; + + for( auto block = blocks->begin(); block != blocks->end(); ++block ) { + + auto * velocityField = block->template getData<VelocityField_T>(velocityFieldId); + WALBERLA_CHECK_NOT_NULLPTR(velocityField) + + auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityFieldId); + WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField) + + const auto ci = velocityField->xyzSizeWithGhostLayer(); + for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) { + + Cell globalCell(*cellIt); + blocks->transformBlockLocalToGlobalCell(globalCell, *block); + Vector3<real_t> cellCenter; + blocks->getCellCenter(cellCenter, globalCell); + + const auto y = cellCenter[wallAxis]; + const auto rel_x = cellCenter[flowAxis] / domainSize[flowAxis]; + const auto rel_z = cellCenter[remAxis] / domainSize[remAxis]; + + const real_t pos = std::max(delta - std::abs(y - delta - 1_r), 0.05_r); + const auto rel_y = pos / delta; + + auto initialVel = frictionVelocity * (std::log(frictionVelocity 
* pos / viscosity) / kappa + B); + + Vector3<real_t> vel; + vel[flowAxis] = initialVel; + + vel[remAxis] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r * rel_x) * + std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r); + + vel[wallAxis] = 8_r * frictionVelocity / kappa * + (std::sin(math::pi * 8_r * rel_z) * std::sin(math::pi * 8_r * rel_y) + + std::sin(math::pi * 8_r * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r); + + for(uint_t d = 0; d < 3; ++d) { + velocityField->get(*cellIt, d) = vel[d]; + meanVelocityField->get(*cellIt, d) = vel[d]; + } + + } + } + + } // function setVelocityFieldsHenrik + + /// SIMULATION PARAMETERS + + struct SimulationParameters { + + SimulationParameters(const Config::BlockHandle & config) + { + channelHalfWidth = config.getParameter<uint_t>("channel_half_width"); + fullChannel = config.getParameter<bool>("full_channel", false); + + /// TARGET QUANTITIES + + targetFrictionReynolds = config.getParameter<real_t>("target_friction_reynolds"); + targetBulkVelocity = config.getParameter<real_t>("target_bulk_velocity", 0.05_r); + + targetBulkReynolds = dean_correlation::calculateBulkReynoldsNumber(targetFrictionReynolds); + viscosity = 2_r * real_c(channelHalfWidth) * targetBulkVelocity / targetBulkReynolds; + targetFrictionVelocity = targetFrictionReynolds * viscosity / real_c(channelHalfWidth); + + /// TIMESTEPS + + timesteps = config.getParameter<uint_t>("timesteps", 0); + const uint_t turnoverPeriods = config.getParameter<uint_t>("turnover_periods", 0); + + WALBERLA_ASSERT((timesteps != 0) != (turnoverPeriods != 0), + "Either timesteps OR turnover periods must be given.") + + if(turnoverPeriods != 0) { + // turnover period defined by T = delta / u_tau + timesteps = turnoverPeriods * uint_c((real_c(channelHalfWidth) / targetFrictionVelocity)); + } + + /// DOMAIN DEFINITIONS + + // obtained from codegen script -> adapt there + wallAxis = codegen::wallAxis; + flowAxis = codegen::flowAxis; + + uint_t sizeFlowAxis 
= config.getParameter<uint_t>("size_flow_axis", 0); + uint_t sizeRemainingAxis = config.getParameter<uint_t>("size_remaining_axis", 0); + + WALBERLA_ASSERT_NOT_IDENTICAL(wallAxis, flowAxis, "Wall and flow axis must be different.") + + const auto sizeFactor = channelHalfWidth / uint_t(10); + if( !sizeFlowAxis) sizeFlowAxis = sizeFactor * 64; + if( !sizeRemainingAxis) sizeRemainingAxis = sizeFactor * 32; + + domainSize[wallAxis] = fullChannel ? 2 * channelHalfWidth : channelHalfWidth; + domainSize[flowAxis] = sizeFlowAxis; + domainSize[3- wallAxis - flowAxis] = sizeRemainingAxis; + + periodicity[wallAxis] = false; + + boundaryCondition = config.getParameter<std::string>("wall_boundary_condition", "WFB"); + + /// OUTPUT + + auto tsPerPeriod = uint_c((real_c(channelHalfWidth) / targetFrictionVelocity)); + + vtkFrequency = config.getParameter<uint_t>("vtk_frequency", 0); + vtkStart = config.getParameter<uint_t>("vtk_start", 0); + plotFrequency = config.getParameter<uint_t>("plot_frequency", 0); + plotStart = config.getParameter<uint_t>("plot_start", 0); + + // vtk start + vtkStart = config.getParameter<uint_t>("vtk_start_timesteps", 0); + const uint_t vtkStartPeriods = config.getParameter<uint_t>("vtk_start_periods", 0); + + if(vtkStart || vtkStartPeriods) { + WALBERLA_ASSERT((vtkStart != 0) != (vtkStartPeriods != 0), + "VTK start must be given in timesteps OR periods, not both.") + } + + if(vtkStartPeriods != 0) { + // turnover period defined by T = delta / u_tau + vtkStart = vtkStartPeriods * tsPerPeriod; + } + + // plot start + plotStart = config.getParameter<uint_t>("plot_start_timesteps", 0); + const uint_t plotStartPeriods = config.getParameter<uint_t>("plot_start_periods", 0); + + if(plotStart || plotStartPeriods) { + WALBERLA_ASSERT((plotStart != 0) != (plotStartPeriods != 0), + "Plotting start must be given in timesteps OR periods, not both.") + } + + if(plotStartPeriods != 0) { + // turnover period defined by T = delta / u_tau + plotStart = plotStartPeriods * 
tsPerPeriod; + } + + // frequencies + if(plotFrequency) { + timesteps = uint_c(std::ceil(real_c(timesteps) / real_c(plotFrequency))) * plotFrequency; + } + + // sampling start & interval + samplingStart = config.getParameter<uint_t>("sampling_start_timesteps", 0); + const uint_t samplingStartPeriods = config.getParameter<uint_t>("sampling_start_periods", 0); + + if(samplingStart || samplingStartPeriods) { + WALBERLA_ASSERT((samplingStart != 0) != (samplingStartPeriods != 0), + "Sampling start must be given in timesteps OR periods, not both.") + } + + if(samplingStartPeriods != 0) { + // turnover period defined by T = delta / u_tau + samplingStart = samplingStartPeriods * tsPerPeriod; + } + + samplingInterval = config.getParameter<uint_t>("sampling_interval_timesteps", 0); + const uint_t samplingIntervalPeriods = config.getParameter<uint_t>("sampling_interval_periods", 0); + + if(samplingInterval || samplingIntervalPeriods) { + WALBERLA_ASSERT((samplingInterval != 0) != (samplingIntervalPeriods != 0), + "Sampling start must be given in timesteps OR periods, not both.") + } + + if(samplingStartPeriods != 0) { + // turnover period defined by T = delta / u_tau + samplingInterval = samplingIntervalPeriods * tsPerPeriod; + } + + timesteps += 1; + + } + + uint_t channelHalfWidth{}; + bool fullChannel{}; + Vector3<uint_t> domainSize{}; + Vector3<uint_t> periodicity{true}; + + real_t targetFrictionReynolds{}; + real_t targetBulkReynolds{}; + real_t targetFrictionVelocity{}; + real_t targetBulkVelocity{}; + + real_t viscosity{}; + const real_t density{1.0}; + + uint_t timesteps{}; + + std::string boundaryCondition{}; + + uint_t wallAxis{}; + uint_t flowAxis{}; + + /// output + uint_t vtkFrequency{}; + uint_t vtkStart{}; + uint_t plotFrequency{}; + uint_t plotStart{}; + uint_t samplingStart{}; + uint_t samplingInterval{}; + + }; + + + namespace boundaries { + void createBoundaryConfig(const SimulationParameters & parameters, Config::Block & boundaryBlock) { + + auto & 
bottomWall = boundaryBlock.createBlock("Border"); + bottomWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, true)]); + bottomWall.addParameter("walldistance", "-1"); + if(parameters.boundaryCondition == "NoSlip") { + bottomWall.addParameter("flag", "NoSlip"); + } else if(parameters.boundaryCondition == "WFB") { + bottomWall.addParameter("flag", "WFB_bottom"); + } + + auto & topWall = boundaryBlock.createBlock("Border"); + topWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, false)]); + topWall.addParameter("walldistance", "-1"); + if(parameters.fullChannel) { + if (parameters.boundaryCondition == "NoSlip") { + topWall.addParameter("flag", "NoSlip"); + } else if (parameters.boundaryCondition == "WFB") { + topWall.addParameter("flag", "WFB_top"); + } + } else { + topWall.addParameter("flag", "FreeSlip"); + } + + } + } + + /// BULK VELOCITY CALCULATION + template< typename VelocityField_T > + class ForceCalculator { + + public: + ForceCalculator(const std::weak_ptr<StructuredBlockStorage> & blocks, const BlockDataID meanVelocityId, + const SimulationParameters & parameter) + : blocks_(blocks), meanVelocityId_(meanVelocityId), channelHalfWidth_(real_c(parameter.channelHalfWidth)), + targetBulkVelocity_(parameter.targetBulkVelocity), targetFrictionVelocity_(parameter.targetFrictionVelocity) + { + const auto & domainSize = parameter.domainSize; + + Cell maxCell; + maxCell[parameter.wallAxis] = int_c(parameter.channelHalfWidth) - 1; + maxCell[flowDirection_] = int_c(domainSize[flowDirection_]) - 1; + const auto remainingIdx = 3 - parameter.wallAxis - flowDirection_; + maxCell[remainingIdx] = int_c(domainSize[remainingIdx]) - 1; + ci_ = CellInterval(Cell{}, maxCell); + + numCells_ = real_c(parameter.channelHalfWidth * domainSize[flowDirection_] * domainSize[remainingIdx]); + } + + real_t bulkVelocity() const { return bulkVelocity_; } + void setBulkVelocity(const real_t 
bulkVelocity) { bulkVelocity_ = bulkVelocity; } + + void calculateBulkVelocity() { + + // reset bulk velocity + bulkVelocity_ = 0_r; + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + for( auto block = blocks->begin(); block != blocks->end(); ++block) { + + auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityId_); + WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField) + + auto fieldSize = meanVelocityField->xyzSize(); + CellInterval localCi; + blocks->transformGlobalToBlockLocalCellInterval(localCi, *block, ci_); + fieldSize.intersect(localCi); + + auto * slicedField = meanVelocityField->getSlicedField(fieldSize); + WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField) + + for(auto fieldIt = slicedField->beginXYZ(); fieldIt != slicedField->end(); ++fieldIt) { + const auto localMean = fieldIt[flowDirection_]; + bulkVelocity_ += localMean; + } + + } + + mpi::allReduceInplace< real_t >(bulkVelocity_, mpi::SUM); + bulkVelocity_ /= numCells_; + + } + + real_t calculateDrivingForce() const { + + // forcing term as in Malaspinas (2014) "Wall model for large-eddy simulation based on the lattice Boltzmann method" + const auto force = targetFrictionVelocity_ * targetFrictionVelocity_ / channelHalfWidth_ + + (targetBulkVelocity_ - bulkVelocity_) * targetBulkVelocity_ / channelHalfWidth_; + + return force; + } + + private: + const std::weak_ptr<StructuredBlockStorage> blocks_{}; + const BlockDataID meanVelocityId_{}; + + const uint_t flowDirection_{}; + const real_t channelHalfWidth_{}; + const real_t targetBulkVelocity_{}; + const real_t targetFrictionVelocity_{}; + + CellInterval ci_{}; + + real_t numCells_{}; + real_t bulkVelocity_{}; + }; + + template< typename Welford_T > + class TurbulentChannelPlotter { + + public: + TurbulentChannelPlotter(SimulationParameters const * const parameters, Timeloop * const timeloop, + ForceCalculator<VectorField_T> const * const forceCalculator, + const std::weak_ptr<StructuredBlockStorage> & blocks, + 
const BlockDataID velocityFieldId, const BlockDataID meanVelocityFieldId, + const BlockDataID meanTkeSGSFieldId, Welford_T * velocityWelford, + const bool separateFile = false) + : parameters_(parameters), forceCalculator_(forceCalculator), timeloop_(timeloop), blocks_(blocks), + velocityWelford_(velocityWelford), velocityFieldId_(velocityFieldId), meanVelocityFieldId_(meanVelocityFieldId), + meanTkeSGSFieldId_(meanTkeSGSFieldId), plotFrequency_(parameters->plotFrequency), plotStart_(parameters->plotStart), + separateFiles_(separateFile) + { + if(!plotFrequency_) + return; + + // prepare output folder + const filesystem::path path(baseFolder_); + std::string fileSuffix = parameters->boundaryCondition + "_"; + + if(parameters->fullChannel) + fileSuffix += "full_D"; + else + fileSuffix += "half_D"; + + fileSuffix += std::to_string(parameters->channelHalfWidth) + "_Re" + + std::to_string(int(parameters->targetFrictionReynolds)) ; + + velocityProfilesFilePath_ = path / ("velocity_profiles_" + fileSuffix); + forcingDataFilePath_ = path / ("forcing_data_" + fileSuffix + "_t" + + std::to_string(parameters->timesteps-1) + ".txt"); + + WALBERLA_ROOT_SECTION() { + // create directory if not existent; empty if existent + if( !filesystem::exists(path) ) { + filesystem::create_directories(path); + } else { + for (const auto& entry : filesystem::directory_iterator(path)) + std::filesystem::remove_all(entry.path()); + } + } + + // write force header + std::ofstream os (forcingDataFilePath_, std::ios::out | std::ios::trunc); + if(os.is_open()) { + os << "# timestep\t bulk_velocity\t driving_force\n"; + os.close(); + } else { + WALBERLA_ABORT("Could not open forcing data file.") + } + + } + + void operator()() { + + const auto ts = timeloop_->getCurrentTimeStep(); + if(ts < plotStart_) + return; + + if(!plotFrequency_ || (ts % plotFrequency_)) + return; + + const auto channelHalfWidth = real_c(parameters_->channelHalfWidth); + const auto bulkVelocity = 
forceCalculator_->bulkVelocity(); + + /// write force data + + WALBERLA_ROOT_SECTION() { + std::ofstream forceOS(forcingDataFilePath_, std::ios::out | std::ios::app); + if (forceOS.is_open()) + { + forceOS << ts << "\t" << bulkVelocity << "\t" << forceCalculator_->calculateDrivingForce() << "\n"; + forceOS.close(); + } + } + + /// write velocity data + + // gather velocity data + std::vector<real_t> instantaneousVelocityVector(parameters_->channelHalfWidth, 0_r); + std::vector<real_t> meanVelocityVector(parameters_->channelHalfWidth, 0_r); + std::vector<real_t> tkeSGSVector(parameters_->channelHalfWidth, 0_r); + std::vector<real_t> tkeResolvedVector(parameters_->channelHalfWidth, 0_r); + std::vector<real_t> reynoldsStressVector(parameters_->channelHalfWidth * TensorField_T::F_SIZE, 0_r); + + const auto idxFlow = int_c(parameters_->domainSize[parameters_->flowAxis] / uint_t(2)); + const auto idxRem = int_c(parameters_->domainSize[3 - parameters_->flowAxis - parameters_->wallAxis] / uint_t(2)); + + Cell point; + point[parameters_->flowAxis] = idxFlow; + point[3 - parameters_->flowAxis - parameters_->wallAxis] = idxRem; + + const auto flowAxis = int_c(parameters_->flowAxis); + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + for(auto block = blocks->begin(); block != blocks->end(); ++block) { + + const auto * const velocity = block->template getData<VectorField_T>(velocityFieldId_); + WALBERLA_CHECK_NOT_NULLPTR(velocity) + + const auto * const meanVelocity = block->template getData<VectorField_T>(meanVelocityFieldId_); + WALBERLA_CHECK_NOT_NULLPTR(meanVelocity) + + const auto * const tkeSGS = block->template getData<ScalarField_T>(meanTkeSGSFieldId_); + WALBERLA_CHECK_NOT_NULLPTR(tkeSGS) + + const auto * const sop = block->template getData<TensorField_T>(velocityWelford_->sum_of_productsID); + WALBERLA_CHECK_NOT_NULLPTR(sop) + + for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) { + + point[parameters_->wallAxis] = int_c(idx); + + 
Cell localCell; + blocks->transformGlobalToBlockLocalCell(localCell, *block, point); + + if(velocity->xyzSize().contains(localCell)){ + instantaneousVelocityVector[idx] = velocity->get(localCell, flowAxis); + meanVelocityVector[idx] = meanVelocity->get(localCell, flowAxis); + tkeSGSVector[idx] = tkeSGS->get(localCell); + for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) { + reynoldsStressVector[idx*TensorField_T::F_SIZE+i] = sop->get(localCell,i) / velocityWelford_->counter_; + } + tkeResolvedVector[idx] = real_c(0.5) * ( + reynoldsStressVector[idx*TensorField_T::F_SIZE+0] + + reynoldsStressVector[idx*TensorField_T::F_SIZE+4] + + reynoldsStressVector[idx*TensorField_T::F_SIZE+8] + ); + } + } + } + + // MPI exchange information + mpi::reduceInplace(instantaneousVelocityVector, mpi::SUM); + mpi::reduceInplace(meanVelocityVector, mpi::SUM); + mpi::reduceInplace(tkeSGSVector, mpi::SUM); + mpi::reduceInplace(tkeResolvedVector, mpi::SUM); + mpi::reduceInplace(reynoldsStressVector, mpi::SUM); + + WALBERLA_ROOT_SECTION() + { + std::ofstream velocityOS; + filesystem::path path = velocityProfilesFilePath_; + if (separateFiles_) { + path.concat("_t" + std::to_string(timeloop_->getCurrentTimeStep()) + ".txt"); + velocityOS.open(path, std::ios::out | std::ios::trunc); + } else { + path.concat("_t" + std::to_string(parameters_->timesteps-1) + ".txt"); + velocityOS.open(path, std::ios::out | std::ios::trunc); + } + + if (velocityOS.is_open()) { + if (!separateFiles_) velocityOS << "# t = " << ts << "\n"; + velocityOS << "# y/delta\t y+\t u+\t u_mean\t u_instantaneous\t TKE_SGS\t TKE_resolved\t uu_rms\t uv_rms\t uw_rms\t vu_rms\t vv_rms\t vw_rms\t wu_rms\t wv_rms\t ww_rms\n"; + + const auto & viscosity = parameters_->viscosity; + const auto bulkReynolds = 2_r * channelHalfWidth * bulkVelocity / viscosity; + const auto frictionReynolds = dean_correlation::calculateFrictionReynoldsNumber(bulkReynolds); + const auto frictionVelocity = frictionReynolds * viscosity / channelHalfWidth; 
+ + for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) { + // relative position + velocityOS << (real_c(idx)+0.5_r) / channelHalfWidth << "\t"; + // y+ + velocityOS << (real_c(idx)+0.5_r) * frictionVelocity / viscosity << "\t"; + // u+ + velocityOS << meanVelocityVector[idx] / frictionVelocity << "\t"; + // mean velocity + velocityOS << meanVelocityVector[idx] << "\t"; + // instantaneous velocity + velocityOS << instantaneousVelocityVector[idx] << "\t"; + // subgrid-scale TKE + velocityOS << tkeSGSVector[idx] << "\t"; + // resolved TKE + velocityOS << tkeResolvedVector[idx] << "\t"; + // Reynolds stresses + for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) { + velocityOS << reynoldsStressVector[idx*TensorField_T::F_SIZE+i] << "\t"; + } + + velocityOS << "\n"; + } + velocityOS.close(); + } else{ + WALBERLA_ABORT("Could not open velocity plot file " << path.generic_string()) + } + } + } + + private: + + + SimulationParameters const * const parameters_{}; + ForceCalculator<VectorField_T> const * const forceCalculator_{}; + + Timeloop * const timeloop_{}; + const std::weak_ptr<StructuredBlockStorage> blocks_; + Welford_T * const velocityWelford_{}; + + const BlockDataID velocityFieldId_{}; + const BlockDataID meanVelocityFieldId_{}; + const BlockDataID meanTkeSGSFieldId_{}; + + const uint_t plotFrequency_{}; + const uint_t plotStart_{}; + + const bool separateFiles_{false}; + const std::string baseFolder_{"output"}; + filesystem::path velocityProfilesFilePath_; + filesystem::path forcingDataFilePath_; + }; + + ///////////////////// + /// Main Function /// + ///////////////////// + + int main(int argc, char** argv) { + + Environment walberlaEnv(argc, argv); + + if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!") } + + /////////////////////////////////////////////////////// + /// Block Storage Creation and Simulation Parameter /// + /////////////////////////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating 
block forest...") + + const auto channelParameter = walberlaEnv.config()->getOneBlock("TurbulentChannel"); + const SimulationParameters simulationParameters(channelParameter); + + // domain creation + std::shared_ptr<StructuredBlockForest> blocks; + { + Vector3< uint_t > numBlocks; + Vector3< uint_t > cellsPerBlock; + blockforest::calculateCellDistribution(simulationParameters.domainSize, + uint_c(mpi::MPIManager::instance()->numProcesses()), + numBlocks, cellsPerBlock); + + const auto & periodicity = simulationParameters.periodicity; + const auto & domainSize = simulationParameters.domainSize; + + SetupBlockForest sforest; + + sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + + sforest.init( AABB(0_r, 0_r, 0_r, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2])), + numBlocks[0], numBlocks[1], numBlocks[2], periodicity[0], periodicity[1], periodicity[2] ); + + // calculate process distribution + + const memory_t memoryLimit = numeric_cast< memory_t >( sforest.getNumberOfBlocks() ); + + const blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig( + true, false, std::bind( blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, + cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] ) ); + + sforest.calculateProcessDistribution_Default( uint_c( MPIManager::instance()->numProcesses() ), memoryLimit, + "hilbert", 10, false, metisConfig ); + + if( !MPIManager::instance()->rankValid() ) + MPIManager::instance()->useWorldComm(); + + // create StructuredBlockForest (encapsulates a newly created BlockForest) + + WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << sforest) + + sforest.writeVTKOutput("domain_decomposition"); + + auto bf = std::make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false ); + + blocks = std::make_shared< StructuredBlockForest >( bf, cellsPerBlock[0], cellsPerBlock[1], 
cellsPerBlock[2] ); + blocks->createCellBoundingBoxes(); + + } + + //////////////////////////////////// + /// PDF Field and Velocity Setup /// + //////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating fields...") + + // Common Fields + const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), codegen::layout); + const BlockDataID meanVelocityFieldId = field::addToStorage< VectorField_T >(blocks, "mean velocity", real_c(0.0), codegen::layout); + const BlockDataID sopFieldId = field::addToStorage< TensorField_T >(blocks, "sum of products", real_c(0.0), codegen::layout); + + const BlockDataID tkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "tke_SGS", real_c(0.0), codegen::layout); + const BlockDataID meanTkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "mean_tke_SGS", real_c(0.0), codegen::layout); + + const BlockDataID omegaFieldId = field::addToStorage< ScalarField_T >(blocks, "omega_out", real_c(0.0), codegen::layout); + + const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + + // CPU Field for PDFs + const BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), codegen::layout); + + /////////////////////////////////////////// + /// Force and bulk velocity calculation /// + /////////////////////////////////////////// + + ForceCalculator<VectorField_T> forceCalculator(blocks, velocityFieldId, simulationParameters); + + ////////////// + /// Setter /// + ////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Setting up fields...") + + // Velocity field setup + setVelocityFieldsAsmuth<VectorField_T>( + blocks, velocityFieldId, meanVelocityFieldId, + simulationParameters.targetFrictionVelocity, simulationParameters.channelHalfWidth, + 5.5_r, 0.41_r, simulationParameters.viscosity, + simulationParameters.wallAxis, simulationParameters.flowAxis ); + + 
forceCalculator.setBulkVelocity(simulationParameters.targetBulkVelocity); + const auto initialForce = forceCalculator.calculateDrivingForce(); + + // pdfs setup + Setter_T pdfSetter(pdfFieldId, velocityFieldId, initialForce, simulationParameters.density); + + for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) + pdfSetter(blockIt.get()); + + ///////////// + /// Sweep /// + ///////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating sweeps...") + + const auto omega = lbm::collision_model::omegaFromViscosity(simulationParameters.viscosity); + StreamCollideSweep_T streamCollideSweep(omegaFieldId, pdfFieldId, velocityFieldId, initialForce, omega); + + WelfordSweep_T welfordSweep(meanVelocityFieldId, sopFieldId, velocityFieldId, 0_r); + TKEWelfordSweep_T welfordTKESweep(meanTkeSgsFieldId, tkeSgsFieldId, 0_r); + + TkeSgsWriter_T tkeSgsWriter(omegaFieldId, pdfFieldId, tkeSgsFieldId, initialForce, omega); + + ///////////////////////// + /// Boundary Handling /// + ///////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating boundary handling...") + + const FlagUID fluidFlagUID("Fluid"); + + Config::Block boundaryBlock; + boundaries::createBoundaryConfig(simulationParameters, boundaryBlock); + + std::unique_ptr<WFB_bottom_T> wfb_bottom_ptr = std::make_unique<WFB_bottom_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity); + std::unique_ptr<WFB_top_T > wfb_top_ptr = std::make_unique<WFB_top_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity); + + NoSlip_T noSlip(blocks, pdfFieldId); + FreeSlip_top_T freeSlip_top(blocks, pdfFieldId); + + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, Config::BlockHandle(&boundaryBlock)); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + + noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID); + freeSlip_top.fillFromFlagField< 
FlagField_T >(blocks, flagFieldId, FlagUID("FreeSlip"), fluidFlagUID); + wfb_bottom_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_bottom"), fluidFlagUID); + wfb_top_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_top"), fluidFlagUID); + + ////////////// + /// Output /// + ////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating field output...") + + // vtk output + auto vtkWriter = vtk::createVTKOutput_BlockData( + blocks, "field_writer", simulationParameters.vtkFrequency, 0, false, "vtk_out", "simulation_step", + false, false, true, false + ); + vtkWriter->setInitialWriteCallsToSkip(simulationParameters.vtkStart); + + // velocity field writer + auto velocityWriter = std::make_shared<field::VTKWriter<VectorField_T>>(velocityFieldId, "instantaneous velocity"); + vtkWriter->addCellDataWriter(velocityWriter); + + auto meanVelocityFieldWriter = std::make_shared<field::VTKWriter<VectorField_T>>(meanVelocityFieldId, "mean velocity"); + vtkWriter->addCellDataWriter(meanVelocityFieldWriter); + + // vtk writer + { + auto flagOutput = vtk::createVTKOutput_BlockData( + blocks, "flag_writer", 1, 1, false, "vtk_out", "simulation_step", + false, true, true, false + ); + auto flagWriter = std::make_shared<field::VTKWriter<FlagField_T>>(flagFieldId, "flag field"); + flagOutput->addCellDataWriter(flagWriter); + flagOutput->write(); + } + + + ///////////////// + /// Time Loop /// + ///////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Creating timeloop...") + + SweepTimeloop timeloop(blocks->getBlockStorage(), simulationParameters.timesteps); + + // Communication + blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); + + auto setNewForce = [&](const real_t newForce) { + streamCollideSweep.F_x_ = newForce; + tkeSgsWriter.F_x_ = newForce; + tkeSgsWriter.F_x_ = newForce; + }; + + // plotting + const bool outputSeparateFiles = 
channelParameter.getParameter<bool>("separate_files", false); + const TurbulentChannelPlotter<WelfordSweep_T > plotter(&simulationParameters, &timeloop, &forceCalculator, blocks, + velocityFieldId, meanVelocityFieldId, + meanTkeSgsFieldId, &welfordSweep, + outputSeparateFiles); + + //NOTE must convert sweeps that are altered to lambdas, otherwise copy and counter will stay 0 + auto welfordLambda = [&welfordSweep, &welfordTKESweep](IBlock * block) { + welfordSweep(block); + welfordTKESweep(block); + }; + + auto wfbLambda = [&wfb_bottom_ptr, &wfb_top_ptr](IBlock * block) { + wfb_bottom_ptr->operator()(block); + wfb_top_ptr->operator()(block); + }; + + auto streamCollideLambda = [&streamCollideSweep](IBlock * block) { + streamCollideSweep(block); + }; + + // Timeloop + timeloop.add() << BeforeFunction(communication, "communication") + << BeforeFunction([&](){forceCalculator.calculateBulkVelocity();}, "bulk velocity calculation") + << BeforeFunction([&](){ + const auto newForce = forceCalculator.calculateDrivingForce(); + setNewForce(newForce); + }, "new force setter") + << Sweep([](IBlock *){}, "new force setter"); + timeloop.add() << Sweep(freeSlip_top, "freeSlip"); + timeloop.add() << Sweep(noSlip, "noSlip"); + timeloop.add() << Sweep(wfbLambda, "wall function bounce"); + timeloop.add() << Sweep(streamCollideLambda, "stream and collide"); + timeloop.add() << BeforeFunction([&](){ + const uint_t velCtr = uint_c(welfordSweep.counter_); + if((timeloop.getCurrentTimeStep() == simulationParameters.samplingStart) || + (timeloop.getCurrentTimeStep() > simulationParameters.samplingStart && simulationParameters.samplingInterval && (velCtr % simulationParameters.samplingInterval == 0))) { + welfordSweep.counter_ = real_t(0); + welfordTKESweep.counter_ = real_t(0); + for(auto & block : *blocks) { + auto * sopField = block.template getData<TensorField_T >(sopFieldId); + sopField->setWithGhostLayer(0.0); + + auto * tkeField = block.template getData<ScalarField_T>(tkeSgsFieldId); 
+ tkeField->setWithGhostLayer(0.0); + } + } + + welfordSweep.counter_ = welfordSweep.counter_ + real_c(1); + welfordTKESweep.counter_ = welfordTKESweep.counter_ + real_c(1); + }, "welford sweep") + << Sweep(welfordLambda, "welford sweep"); + timeloop.add() << Sweep(tkeSgsWriter, "TKE_SGS writer"); + + timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkWriter), "VTK field output"); + timeloop.addFuncAfterTimeStep(plotter, "Turbulent quantity plotting"); + + // LBM stability check + timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >( + walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ), + "LBM stability check" ); + + // Time logger + timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 5_r), + "remaining time logger"); + + + WALBERLA_LOG_INFO_ON_ROOT("Running timeloop with " << timeloop.getNrOfTimeSteps() - 1 << " timesteps...") + + WcTimingPool timing; + + WcTimer timer; + timer.start(); + + timeloop.run(timing); + + timer.end(); + + double time = timer.max(); + walberla::mpi::reduceInplace(time, walberla::mpi::MAX); + + const auto timeloopTiming = timing.getReduced(); + WALBERLA_LOG_INFO_ON_ROOT("Timeloop timing:\n" << *timeloopTiming) + + const walberla::lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID); + performance.logResultOnRoot(simulationParameters.timesteps, time); + + return EXIT_SUCCESS; + } + +} // namespace walberla + +int main(int argc, char** argv) { return walberla::main(argc, argv); } diff --git a/apps/benchmarks/TurbulentChannel/TurbulentChannel.prm b/apps/benchmarks/TurbulentChannel/TurbulentChannel.prm new file mode 100644 index 0000000000000000000000000000000000000000..27c18411d0b24a2b1d4bb03401c4c543eb023d92 --- /dev/null +++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.prm @@ -0,0 +1,35 @@ + +TurbulentChannel { + + channel_half_width 20; + full_channel 0; + + wall_boundary_condition WFB; + + 
target_friction_Reynolds 395; + target_bulk_velocity 0.1; + + // turnover_periods 50; + timesteps 100; + + // sampling_start_timesteps 50; + // sampling_start_periods 1; + // sampling_interval_timesteps 20; + // sampling_interval_periods 1; + + // vtk_start_timesteps 50; + // vtk_start_periods 1000; + // plot_start_timesteps 50; + // plot_start_periods 1000; + vtk_frequency 10; + plot_frequency 10; + separate_files 0; + +} + +StabilityChecker +{ + checkFrequency 10000; + streamOutput false; + vtkOutput true; +} diff --git a/apps/benchmarks/TurbulentChannel/TurbulentChannel.py b/apps/benchmarks/TurbulentChannel/TurbulentChannel.py new file mode 100644 index 0000000000000000000000000000000000000000..72df02190b623c89d6a255765b82647954055e97 --- /dev/null +++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.py @@ -0,0 +1,201 @@ +import sympy as sp +import pystencils as ps + +from lbmpy.enums import SubgridScaleModel +from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel +from lbmpy.flow_statistics import welford_assignments +from lbmpy.relaxationrates import lattice_viscosity_from_relaxation_rate + +from lbmpy.creationfunctions import create_lb_update_rule +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter + +from lbmpy.boundaries import NoSlip, FreeSlip, WallFunctionBounce +from lbmpy.boundaries.wall_function_models import SpaldingsLaw, MoninObukhovSimilarityTheory +from lbmpy.utils import frobenius_norm, second_order_moment_tensor + +from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel +from lbmpy_walberla import generate_boundary + +# ===================== +# Code Generation +# ===================== + +info_header = """ +#ifndef TURBULENTCHANNEL_INCLUDES +#define TURBULENTCHANNEL_INCLUDES + +#include <stencil/D{d}Q{q}.h> + +#include "TurbulentChannel_Sweep.h" +#include "TurbulentChannel_PackInfo.h" +#include "TurbulentChannel_Setter.h" +#include 
"TurbulentChannel_Welford.h" +#include "TurbulentChannel_Welford_TKE_SGS.h" +#include "TurbulentChannel_TKE_SGS_Writer.h" + +#include "TurbulentChannel_NoSlip.h" +#include "TurbulentChannel_FreeSlip_top.h" +#include "TurbulentChannel_WFB_top.h" +#include "TurbulentChannel_WFB_bottom.h" + +namespace walberla {{ + namespace codegen {{ + using Stencil_T = walberla::stencil::D{d}Q{q}; + static constexpr uint_t flowAxis = {flow_axis}; + static constexpr uint_t wallAxis = {wall_axis}; + + static constexpr field::Layout layout = field::{layout}; + }} +}} + +#endif // TURBULENTCHANNEL_INCLUDES +""" + + +def check_axis(flow_axis, wall_axis): + assert flow_axis != wall_axis + assert flow_axis < 3 + assert wall_axis < 3 + + +with CodeGeneration() as ctx: + + flow_axis = 0 + wall_axis = 1 + + check_axis(flow_axis=flow_axis, wall_axis=wall_axis) + + # ======================== + # General Parameters + # ======================== + target = ps.Target.CPU + + data_type = "float64" if ctx.double_accuracy else "float32" + stencil = LBStencil(Stencil.D3Q19) + omega = sp.Symbol('omega') + + F_x = sp.Symbol('F_x') + force_vector = [0] * 3 + force_vector[flow_axis] = F_x + + layout = 'fzyx' + + normal_direction_top = [0] * 3 + normal_direction_top[wall_axis] = -1 + normal_direction_top = tuple(normal_direction_top) + + normal_direction_bottom = [0] * 3 + normal_direction_bottom[wall_axis] = 1 + normal_direction_bottom = tuple(normal_direction_bottom) + + # PDF Fields + pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[{stencil.D}D]', layout=layout) + + # Output Fields + omega_field = ps.fields(f"omega_out: {data_type}[{stencil.D}D]", layout=layout) + sgs_tke = ps.fields(f"sgs_tke: {data_type}[{stencil.D}D]", layout=layout) + mean_sgs_tke = ps.fields(f"mean_sgs_tke: {data_type}[{stencil.D}D]", layout=layout) + velocity = ps.fields(f"velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout) + mean_velocity = ps.fields(f"mean_velocity({stencil.D}): 
{data_type}[{stencil.D}D]", layout=layout) + sum_of_products = ps.fields(f"sum_of_products({stencil.D**2}): {data_type}[{stencil.D}D]", layout=layout) + + # LBM Optimisation + lbm_opt = LBMOptimisation(cse_global=True, + symbolic_field=pdfs, + symbolic_temporary_field=pdfs_tmp, + field_layout=layout) + + # ================== + # Method Setup + # ================== + + lbm_config = LBMConfig(stencil=stencil, + method=Method.CUMULANT, + force_model=ForceModel.GUO, + force=tuple(force_vector), + relaxation_rate=omega, + subgrid_scale_model=SubgridScaleModel.QR, + # galilean_correction=True, + compressible=True, + omega_output_field=omega_field, + output={'velocity': velocity}) + + update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + lbm_method = update_rule.method + + # ======================== + # PDF Initialization + # ======================== + + initial_rho = sp.Symbol('rho_0') + + pdfs_setter = macroscopic_values_setter(lbm_method, + initial_rho, + velocity.center_vector, + pdfs.center_vector) + + # LBM Sweep + generate_sweep(ctx, "TurbulentChannel_Sweep", update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) + + # Pack Info + generate_pack_info_from_kernel(ctx, "TurbulentChannel_PackInfo", update_rule, target=target) + + # Macroscopic Values Setter + generate_sweep(ctx, "TurbulentChannel_Setter", pdfs_setter, target=target, ghost_layers_to_include=1) + + # Welford update + # welford_update = welford_assignments(vector_field=velocity, mean_vector_field=mean_velocity) + welford_update = welford_assignments(field=velocity, mean_field=mean_velocity, + sum_of_products_field=sum_of_products) + generate_sweep(ctx, "TurbulentChannel_Welford", welford_update, target=target) + + tke_welford_update = welford_assignments(field=sgs_tke, mean_field=mean_sgs_tke) + generate_sweep(ctx, "TurbulentChannel_Welford_TKE_SGS", tke_welford_update, target=target) + + # subgrid TKE output + @ps.kernel + def tke_sgs_writer(): + f_neq = 
sp.Matrix(pdfs.center_vector) - lbm_method.get_equilibrium_terms() + rho = lbm_method.conserved_quantity_computation.density_symbol + strain_rate = frobenius_norm(-3 * omega_field.center / (2 * rho) * second_order_moment_tensor(f_neq, lbm_method.stencil)) + eddy_viscosity = lattice_viscosity_from_relaxation_rate(omega_field.center) - lattice_viscosity_from_relaxation_rate(omega) + + sgs_tke.center @= (eddy_viscosity * strain_rate**2)**(2.0/3.0) + + tke_sgs_ac = ps.AssignmentCollection( + [lbm_method.conserved_quantity_computation.equilibrium_input_equations_from_pdfs(pdfs.center_vector), + *tke_sgs_writer] + ) + generate_sweep(ctx, "TurbulentChannel_TKE_SGS_Writer", tke_sgs_ac) + + # Boundary conditions + nu = lattice_viscosity_from_relaxation_rate(omega) + u_tau_target = sp.Symbol("target_u_tau") + + noslip = NoSlip() + freeslip_top = FreeSlip(stencil, normal_direction=normal_direction_top) + wfb_top = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_top, + wall_function_model=SpaldingsLaw(viscosity=nu, + kappa=0.41, b=5.5, newton_steps=5), + mean_velocity=mean_velocity, data_type=data_type, + target_friction_velocity=u_tau_target) + wfb_bottom = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_bottom, + wall_function_model=SpaldingsLaw(viscosity=nu, + kappa=0.41, b=5.5, newton_steps=5), + mean_velocity=mean_velocity, data_type=data_type, + target_friction_velocity=u_tau_target) + + generate_boundary(ctx, "TurbulentChannel_NoSlip", noslip, lbm_method, target=target) + generate_boundary(ctx, "TurbulentChannel_FreeSlip_top", freeslip_top, lbm_method, target=target) + generate_boundary(ctx, "TurbulentChannel_WFB_bottom", wfb_bottom, lbm_method, target=target) + generate_boundary(ctx, "TurbulentChannel_WFB_top", wfb_top, lbm_method, target=target) + + info_header_params = { + 'layout': layout, + 'd': stencil.D, + 'q': stencil.Q, + 'flow_axis': flow_axis, + 'wall_axis': wall_axis + } + + ctx.write_file("CodegenIncludes.h", 
info_header.format(**info_header_params)) diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index dfcd22a87e6942fb7ac2bc5789ac92fdd65fec9f..4674cfae92aaaf652b8e83e4e1dcae9c87427c1a 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -174,7 +174,7 @@ int main(int argc, char** argv) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0); - auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldId, "vel"); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldId, "vel"); vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py index ae9ec4f1bd6e26a8474099cd6f03d4c40f114854..5a600eade461c5698c35487ce6aca8a78207aa63 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py @@ -10,7 +10,7 @@ from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule -from lbmpy.enums import Method, Stencil +from lbmpy.enums import Method, Stencil, SubgridScaleModel from lbmpy.fieldaccess import CollideOnlyInplaceAccessor from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel @@ -73,7 +73,7 @@ options_dict = { }, 'smagorinsky': { 'method': Method.SRT, - 'smagorinsky': False, + 'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY, 'relaxation_rate': omega, } } @@ -152,7 +152,8 @@ with CodeGeneration() as ctx: lbm_config=lbm_config, 
lbm_optimisation=lbm_opt, nonuniform=False, boundaries=[no_slip, ubb], macroscopic_fields=macroscopic_fields, - cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec, + set_pre_collision_pdfs=False) # Stream only kernel generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index fdc8969d626b866b978dfd1260565c50f96f01b8..91b7a02107c8e11e8b760aee1207895e436c5d3a 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -205,7 +205,7 @@ int main(int argc, char** argv) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0); - auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index 3d7579e5bcb3f3713f59a9afd94d7fed790c21e9..09235c4340a0b0946f72c513a219bca3c28dd724 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -8,6 +8,7 @@ from pystencils.typing import TypedSymbol from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil +from lbmpy.enums import SubgridScaleModel from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB @@ -84,7 +85,7 @@ options_dict = { }, 'smagorinsky': { 'method': Method.SRT, - 'smagorinsky': False, + 'subgrid_scale_model': 
SubgridScaleModel.SMAGORINSKY, 'relaxation_rate': omega, } } @@ -168,7 +169,7 @@ with CodeGeneration() as ctx: nonuniform=False, boundaries=[no_slip, ubb], macroscopic_fields=macroscopic_fields, target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, - max_threads=max_threads) + max_threads=max_threads, set_pre_collision_pdfs=False) # Stream only kernel vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')] diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py index 619c9fbfce1bea16a3008ce3efd4d136a63656f8..a855b9d211378d326fa5c7734e88e36e59141e2b 100644 --- a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py @@ -1,6 +1,7 @@ import sympy as sp import numpy as np import pystencils as ps +from lbmpy.enums import Method, Stencil, SubgridScaleModel from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule from lbmpy.boundaries import NoSlip, UBB from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor @@ -65,7 +66,7 @@ options_dict = { 'smagorinsky': { 'method': 'srt', 'stencil': 'D3Q19', - 'smagorinsky': True, + 'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY, 'relaxation_rate': omega, }, 'cumulant': { diff --git a/apps/tutorials/basics/01_BlocksAndFields.dox b/apps/tutorials/basics/01_BlocksAndFields.dox index 944c64cb50331323490d89a6ccb266418fd8de5b..4feaf75a4ed7f104ad0f37514bddbe5b0d0a5c1a 100644 --- a/apps/tutorials/basics/01_BlocksAndFields.dox +++ b/apps/tutorials/basics/01_BlocksAndFields.dox @@ -6,7 +6,7 @@ namespace walberla { \brief Introduction to block structure and field. This tutorial walks you through the process of creating a simple waLBerla application. -The source file of this tutorial can be found in apps/tutorials/01_BlocksAndFields.cpp. 
+The source file of this tutorial can be found in `apps/tutorials/01_BlocksAndFields.cpp`. To compile and run this example, go to your build directory into `apps/tutorials` type `make` and run the generated executable. @@ -153,7 +153,7 @@ Using this setup mechanism, waLBerla does not enforce that the fields have the s Remember: For waLBerla, a block is just a container for arbitrary data - and a field is just an "arbitrary" data item stored on each block. Block data does not have to be any waLBerla data structure. It is possible to store any type of data on a block, -so instead of using the field class, we could, for example, have used a std::vector<std::vector<double> > to store our lattice. +so instead of using the field class, we could, for example, have used a `std::vector<std::vector<double>>` to store our lattice. The callback function can now be registered at the block storage with the following piece of code: @@ -171,7 +171,7 @@ dock widget can then be used to display slices of the field. \image html tutorial_basics01_field.jpeg -The next tutorial contains the writing of algorithms operating on block data: \ref tutorial02 +The next tutorial contains the writing of algorithms operating on block data: \ref tutorial_basics_02 \tableofcontents diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.dox b/apps/tutorials/gpu/01_GameOfLife_cuda.dox index 8794e6c520ffb31d2c3653622cb2f4b4ba4b6eda..0c811bea939f122291cb46962f9087f64c4c62f1 100644 --- a/apps/tutorials/gpu/01_GameOfLife_cuda.dox +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.dox @@ -37,9 +37,9 @@ auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >(); BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator); \endcode -Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 . +Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics_03 . 
Then two GPU fields are created: "source" and "destination" field. The helper function -gpu::addGPUFieldToStorage() creates a gpu::GPUField field of the same size and layout of the given +\ref gpu::addGPUFieldToStorage() creates a \ref gpu::GPUField field of the same size and layout of the given CPU field: \code BlockDataID gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); @@ -118,10 +118,10 @@ Note that copying data is costly and thus we don't want to do this in every time \section gpu01_comm Communication -For this tutorial we use the gpu::communication::UniformGPUScheme that first collects all data in a buffer and -sends only one message per communication step and neighbor. For the PackInfo we use the MemcpyPackInfo. It receives -a buffer located on the GPU and fills it using memcpy operations -If the GPU library is build with MPI support this buffer can be send to other GPUs without a copy to the CPU. +For this tutorial we use the \ref gpu::communication::UniformGPUScheme that first collects all data in a buffer and +sends only one message per communication step and neighbor. For the `PackInfo` we use the \ref gpu::communication::MemcpyPackInfo. +It receives a buffer located on the GPU and fills it using memcpy operations. +If the GPU library is built with MPI support this buffer can be sent to other GPUs without a copy to the CPU. Otherwise the copying will be done in the back by the communication class. \code diff --git a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp index ae6f612cbb070298a16d0329d258d6256a217756..8d095d222eda3a4e3822ae147d74d020b214ad6f 100644 --- a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp +++ b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp @@ -268,8 +268,9 @@ BoundaryHandling_T* MyBoundaryHandling::operator()(IBlock* const block, else if (setup_.inflowType == "ParserUBB") { //! 
[forceBoundary_ParserUBB_eqs] - char x_eq[150]; - sprintf(x_eq, "0.1*4/%f/%f * y * (%f - y) * 0.5 * (1 - cos(2 * 3.1415926538 * t / %f));", H, H, H, setup_.period); + const uint_t maxSize = 150; + char x_eq[maxSize]; + snprintf(x_eq, maxSize, "0.1*4/%f/%f * y * (%f - y) * 0.5 * (1 - cos(2 * 3.1415926538 * t / %f));", H, H, H, setup_.period); std::array< std::string, 3 > eqs = { x_eq, "0", "0" }; handling->forceBoundary(ParserUBBFlagUID, west, ParserUBB_T::Parser(eqs)); diff --git a/cmake/FindOpenMesh.cmake b/cmake/FindOpenMesh.cmake index 4dbb7f7dd06fe02b5019b4a52e76287eafbc8651..76ca8682a1e6881a0564fb67234c8230362854eb 100644 --- a/cmake/FindOpenMesh.cmake +++ b/cmake/FindOpenMesh.cmake @@ -53,7 +53,7 @@ # #=========================================================================== -cmake_minimum_required(VERSION 3.3.0) +cmake_minimum_required(VERSION 3.5.0) #if already found via finder or simulated finder in openmesh CMakeLists.txt, skip the search IF (NOT OPENMESH_FOUND) diff --git a/cmake/waLBerlaFunctions.cmake b/cmake/waLBerlaFunctions.cmake index 4962e18cabc6a6128ac8b003c99cfe6110d45520..cd0796d2bfe589a01f408d4124a90d34f9cfbf19 100644 --- a/cmake/waLBerlaFunctions.cmake +++ b/cmake/waLBerlaFunctions.cmake @@ -39,7 +39,7 @@ function ( waLBerla_add_module ) set( ALL_DEPENDENCIES ${ARG_DEPENDS} ${ARG_OPTIONAL_DEPENDS}) # Module name is the directory relative to WALBERLA_MODULE_DIRS - get_current_module_name ( moduleName ) + get_current_module_name ( ) get_module_library_name ( moduleLibraryName ${moduleName} ) # Test if all required libraries are available @@ -174,6 +174,9 @@ function ( waLBerla_add_executable ) foreach ( depMod ${ARG_DEPENDS} ) get_module_library_name ( depModLibraryName ${depMod} ) if( NOT TARGET ${depModLibraryName} ) + if( WALBERLA_DEPS_ERROR ) + message( FATAL_ERROR "Module ${depMod} is missing to build target ${ARG_NAME}" ) + endif() if( WALBERLA_LOG_SKIPPED ) message ( STATUS "Skipping ${ARG_NAME} since dependent module ${depMod} was not 
built" ) endif() @@ -231,7 +234,7 @@ function ( waLBerla_compile_test ) cmake_parse_arguments( ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) # Module name is the directory relative to WALBERLA_MODULE_DIRS - get_current_module_name ( moduleName ) + get_current_module_name ( ) # Filename of first source file is used as name for testcase if no name was given if( NOT ARG_NAME ) @@ -362,7 +365,7 @@ function ( waLBerla_execute_test ) if( ARG_NO_MODULE_LABEL ) set_tests_properties ( ${ARG_NAME} PROPERTIES LABELS "${ARG_LABELS}" ) else() - get_current_module_name ( moduleName ) + get_current_module_name ( ) set_tests_properties ( ${ARG_NAME} PROPERTIES LABELS "${moduleName} ${ARG_LABELS}" ) endif() diff --git a/cmake/waLBerlaModuleDependencySystem.cmake b/cmake/waLBerlaModuleDependencySystem.cmake index b955d81f4e5833ccba8e2122a614be6241eb46de..fc3ad4f7e8f6979fb867f0e9fea4f8fbe25194ba 100644 --- a/cmake/waLBerlaModuleDependencySystem.cmake +++ b/cmake/waLBerlaModuleDependencySystem.cmake @@ -46,19 +46,20 @@ # Determine Module name using the current folder # # moduleFolder is the current source directory relative to a folder in WALBERLA_MODULE_DIRS -# If more arguments are given, these are prepended to WALBERLA_MODULE_DIR +# The variable moduleName will be set in PARENT_SCOPE and is the first folder in WALBERLA_MODULE_DIRS # Example: # If CMAKE_CURRENT_SOURCE_DIR is /src/core/field and /src/ is an element in WALBERLA_MODULE_DIRS, -# then module name is "core/field" +# then moduleName is "core" # ####################################################################################################################### -function ( get_current_module_name moduleNameOut ) +function ( get_current_module_name ) foreach( moduleDir ${ARGN} ${WALBERLA_MODULE_DIRS} ) - get_filename_component( moduleNameShort ${CMAKE_CURRENT_SOURCE_DIR} NAME_WE ) file( RELATIVE_PATH moduleFolder ${moduleDir} ${CMAKE_CURRENT_SOURCE_DIR} ) if ( NOT ${moduleFolder} MATCHES "\\.\\./.*" ) - 
set ( ${moduleNameOut} ${moduleFolder} PARENT_SCOPE ) + #append / to make cmake_path also work with one directory only + string(REGEX REPLACE "(.*)/.*" "\\1" moduleNameOut ${moduleFolder}) + set(moduleName ${moduleNameOut} PARENT_SCOPE) return() endif() endforeach() diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py index 770557d14f34fac81e77ab2f8e48d667d10e89ae..daaab32fd40aff5f2150495bf4782ef61a282dae 100644 --- a/python/lbmpy_walberla/additional_data_handler.py +++ b/python/lbmpy_walberla/additional_data_handler.py @@ -9,21 +9,38 @@ try: except ImportError: from lbmpy.custom_code_nodes import MirroredStencilDirections from lbmpy.boundaries.boundaryconditions import LbBoundary -from lbmpy.boundaries import ExtrapolationOutflow, FreeSlip, UBB, DiffusionDirichlet +from lbmpy.boundaries import (ExtrapolationOutflow, FreeSlip, UBB, DiffusionDirichlet, + NoSlipLinearBouzidi, QuadraticBounceBack) from pystencils_walberla.additional_data_handler import AdditionalDataHandler -def default_additional_data_handler(boundary_obj: LbBoundary, lb_method, field_name, target=Target.CPU): +interpolation_bc_check_template = """ +if(!isFlagSet(it.neighbor({cx}, {cy}, {cz}, 0), domainFlag)){{ + //Linear-Bouzidi requires 2 fluid nodes: if the 2nd node is not available abort, + //apply Bounce Back at that point. 
This clearly lowers the accuracy and makes inconsistent the + //calculation of the total force + element.q = -1.0; + WALBERLA_LOG_INFO_ON_ROOT("Warning: Bouzidi cannot be applied at least on one boundary link.") +}} //end if to check Bouzidi applicability +""" + + +def default_additional_data_handler(boundary_obj: LbBoundary, lb_method, field_name, target=Target.CPU, + pdfs_data_type=None, zeroth_timestep=None): if not boundary_obj.additional_data: return None - if isinstance(boundary_obj, FreeSlip): return FreeSlipAdditionalDataHandler(lb_method.stencil, boundary_obj) elif isinstance(boundary_obj, UBB): return UBBAdditionalDataHandler(lb_method.stencil, boundary_obj) elif isinstance(boundary_obj, ExtrapolationOutflow): - return OutflowAdditionalDataHandler(lb_method.stencil, boundary_obj, target=target, field_name=field_name) + return OutflowAdditionalDataHandler(lb_method.stencil, boundary_obj, target=target, field_name=field_name, + pdfs_data_type=pdfs_data_type, zeroth_timestep=zeroth_timestep) + elif isinstance(boundary_obj, NoSlipLinearBouzidi): + return NoSlipLinearBouzidiAdditionalDataHandler(lb_method.stencil, boundary_obj) + elif isinstance(boundary_obj, QuadraticBounceBack): + return QuadraticBounceBackAdditionalDataHandler(lb_method.stencil, boundary_obj) else: raise ValueError(f"No default AdditionalDataHandler available for boundary of type {boundary_obj.__class__}") @@ -76,7 +93,7 @@ class FreeSlipAdditionalDataHandler(AdditionalDataHandler): f" element.wnx = {inv_offset[0]};", f" element.wny = {inv_offset[1]};", f" element.wnz = {inv_offset[2]};", - f" ref_dir = {direction};", + f" ref_dir = {self._walberla_stencil.index(inv_offset)};", "}"] elif self._dim == 2: init_list += ["// concave corner (neighbors are non-fluid)", @@ -84,7 +101,7 @@ class FreeSlipAdditionalDataHandler(AdditionalDataHandler): "{", f" element.wnx = {inv_offset[0]};", f" element.wny = {inv_offset[1]};", - f" ref_dir = {direction};", + f" ref_dir = 
{self._walberla_stencil.index(inv_offset)};", "}"] init_list.append("element.ref_dir = ref_dir;") @@ -107,7 +124,7 @@ class UBBAdditionalDataHandler(AdditionalDataHandler): @property def initialiser_list(self): - return "elementInitaliser(velocityCallback)," + return "elementInitialiser(velocityCallback)," @property def additional_arguments_for_fill_function(self): @@ -117,23 +134,117 @@ class UBBAdditionalDataHandler(AdditionalDataHandler): def additional_parameters_for_fill_function(self): return " const shared_ptr<StructuredBlockForest> &blocks, " - def data_initialisation(self, direction): - init_list = ["Vector3<real_t> InitialisatonAdditionalData = elementInitaliser(Cell(it.x(), it.y(), it.z()), " - "blocks, *block);", "element.vel_0 = InitialisatonAdditionalData[0];", - "element.vel_1 = InitialisatonAdditionalData[1];"] + def data_initialisation(self, *_): + init_list = ["Vector3<real_t> InitialisationAdditionalData = elementInitialiser(Cell(it.x(), it.y(), it.z()), " + "blocks, *block);", "element.vel_0 = InitialisationAdditionalData[0];", + "element.vel_1 = InitialisationAdditionalData[1];"] if self._dim == 3: - init_list.append("element.vel_2 = InitialisatonAdditionalData[2];") + init_list.append("element.vel_2 = InitialisationAdditionalData[2];") return "\n".join(init_list) @property def additional_member_variable(self): return "std::function<Vector3<real_t>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)> " \ - "elementInitaliser; " + "elementInitialiser; " + + +class NoSlipLinearBouzidiAdditionalDataHandler(AdditionalDataHandler): + def __init__(self, stencil, boundary_object): + assert isinstance(boundary_object, NoSlipLinearBouzidi) + + self._dtype = BasicType(boundary_object.data_type).c_name + self._blocks = "const shared_ptr<StructuredBlockForest>&, IBlock&)>" + super(NoSlipLinearBouzidiAdditionalDataHandler, self).__init__(stencil=stencil) + + @property + def constructor_argument_name(self): + return "wallDistanceBouzidi" + + 
@property + def constructor_arguments(self): + return f", std::function<{self._dtype}(const Cell &, const Cell &, {self._blocks}&" \ + f"{self.constructor_argument_name} " + + @property + def initialiser_list(self): + return f"elementInitialiser({self.constructor_argument_name})," + + @property + def additional_arguments_for_fill_function(self): + return "blocks, " + + @property + def additional_parameters_for_fill_function(self): + return " const shared_ptr<StructuredBlockForest> &blocks, " + + def data_initialisation(self, direction): + cx = self._walberla_stencil[direction][0] + cy = self._walberla_stencil[direction][1] + cz = self._walberla_stencil[direction][2] + fluid_cell = "Cell(it.x(), it.y(), it.z())" + boundary_cell = f"Cell(it.x() + {cx}, it.y() + {cy}, it.z() + {cz})" + check_str = interpolation_bc_check_template.format(cx=-cx, cy=-cy, cz=-cz) + init_element = f"elementInitialiser({fluid_cell}, {boundary_cell}, blocks, *block)" + init_list = [f"const {self._dtype} q = (({self._dtype}) {init_element});", + "element.q = q;", + check_str] + + return "\n".join(init_list) + + @property + def additional_member_variable(self): + return f"std::function<{self._dtype}(const Cell &, const Cell &, {self._blocks} elementInitialiser; " + + +class QuadraticBounceBackAdditionalDataHandler(AdditionalDataHandler): + def __init__(self, stencil, boundary_object): + assert isinstance(boundary_object, QuadraticBounceBack) + + self._dtype = BasicType(boundary_object.data_type).c_name + self._blocks = "const shared_ptr<StructuredBlockForest>&, IBlock&)>" + super(QuadraticBounceBackAdditionalDataHandler, self).__init__(stencil=stencil) + + @property + def constructor_argument_name(self): + return "wallDistanceQuadraticBB" + + @property + def constructor_arguments(self): + return f", std::function<{self._dtype}(const Cell &, const Cell &, {self._blocks}&" \ + f"{self.constructor_argument_name} " + + @property + def initialiser_list(self): + return 
f"elementInitialiser({self.constructor_argument_name})," + + @property + def additional_arguments_for_fill_function(self): + return "blocks, " + + @property + def additional_parameters_for_fill_function(self): + return " const shared_ptr<StructuredBlockForest> &blocks, " + + def data_initialisation(self, direction): + cx = self._walberla_stencil[direction][0] + cy = self._walberla_stencil[direction][1] + cz = self._walberla_stencil[direction][2] + fluid_cell = "Cell(it.x(), it.y(), it.z())" + boundary_cell = f"Cell(it.x() + {cx}, it.y() + {cy}, it.z() + {cz})" + init_element = f"elementInitialiser({fluid_cell}, {boundary_cell}, blocks, *block)" + init_list = [f"const {self._dtype} q = (({self._dtype}) {init_element});", "element.q = q;"] + + return "\n".join(init_list) + + @property + def additional_member_variable(self): + return f"std::function<{self._dtype}(const Cell &, const Cell &, {self._blocks} elementInitialiser; " class OutflowAdditionalDataHandler(AdditionalDataHandler): - def __init__(self, stencil, boundary_object, target=Target.CPU, field_name='pdfs', pdfs_data_type=None, zeroth_timestep=None): + def __init__(self, stencil, boundary_object, target=Target.CPU, field_name='pdfs', + pdfs_data_type=None, zeroth_timestep=None): assert isinstance(boundary_object, ExtrapolationOutflow) self._stencil = boundary_object.stencil self._lb_method = boundary_object.lb_method diff --git a/python/lbmpy_walberla/boundary_collection.py b/python/lbmpy_walberla/boundary_collection.py index 082567204acf47b97a1344de8b1ae38288ded567..3830d8bb460bddd2c537ee245c3cc5b689a718da 100644 --- a/python/lbmpy_walberla/boundary_collection.py +++ b/python/lbmpy_walberla/boundary_collection.py @@ -113,7 +113,8 @@ def __generate_alternating_lbm_boundary(generation_context, **create_kernel_params): if boundary_object.additional_data and additional_data_handler is None: target = create_kernel_params.get('target', Target.CPU) - additional_data_handler = 
default_additional_data_handler(boundary_object, lb_method, field_name, target=target) + additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, + target=target, pdfs_data_type=field_data_type) timestep_param_name = 'timestep' timestep_param_dtype = np.uint8 diff --git a/python/lbmpy_walberla/sweep_collection.py b/python/lbmpy_walberla/sweep_collection.py index 5fe4892ab3bc9740f3ca04775363586b42134e96..bc8bdda49dcb88f897f7fa1ce23c9a9b101660a3 100644 --- a/python/lbmpy_walberla/sweep_collection.py +++ b/python/lbmpy_walberla/sweep_collection.py @@ -28,7 +28,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation, refinement_scaling=None, macroscopic_fields: Dict[str, Field] = None, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, - max_threads=None, + max_threads=None, set_pre_collision_pdfs=True, **create_kernel_params): config = config_from_context(ctx, target=target, data_type=data_type, @@ -76,7 +76,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli config_unoptimized = replace(config, cpu_vectorize_info=None, cpu_prepend_optimizations=[], cpu_blocking=None) setter_family = get_setter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields, - config_unoptimized) + config_unoptimized, set_pre_collision_pdfs) setter_generator = kernel_family_function_generator('initialise', setter_family, namespace='lbm', max_threads=max_threads) function_generators.append(setter_generator) @@ -167,7 +167,8 @@ def lbm_kernel_family(class_name, kernel_name, return family -def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig): +def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, + config: CreateKernelConfig, set_pre_collision_pdfs: bool): dim = lb_method.stencil.D density 
= macroscopic_fields.get('density', 1.0) velocity = macroscopic_fields.get('velocity', [0.0] * dim) @@ -184,7 +185,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi timestep_suffix = str(timestep) setter = macroscopic_values_setter(lb_method, density=density, velocity=velocity, pdfs=pdfs, - streaming_pattern=streaming_pattern, previous_timestep=timestep) + streaming_pattern=streaming_pattern, previous_timestep=timestep, + set_pre_collision_pdfs=set_pre_collision_pdfs) if default_dtype != pdfs.dtype: setter = add_subexpressions_for_field_reads(setter, data_type=default_dtype) @@ -198,7 +200,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi timestep = Timestep.BOTH setter = macroscopic_values_setter(lb_method, density=density, velocity=velocity, pdfs=pdfs, - streaming_pattern=streaming_pattern, previous_timestep=timestep) + streaming_pattern=streaming_pattern, previous_timestep=timestep, + set_pre_collision_pdfs=set_pre_collision_pdfs) setter_ast = create_kernel(setter, config=config) setter_ast.function_name = 'kernel_initialise' diff --git a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h index 47f313860dacc6cf69c783047c3ed53e82bcb91b..453f78e093a5087bd80a3bf37dd1b20300558f3a 100644 --- a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h +++ b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h @@ -46,7 +46,7 @@ class {{class_name}} { {% for object_name, boundary_class, kernel, additional_data_handler in zip(object_names, boundary_classes, kernel_list, additional_data_handlers) -%} - {{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'])], additional_data_handler.constructor_argument_name] | type_identifier_list -}}); + {{object_name}} = std::make_shared< {{boundary_class}} 
>({{- ["blocks", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'timestep', 'gpuStream'], use_field_ids=True)], additional_data_handler.constructor_argument_name] | type_identifier_list -}}); {% endfor %} {% for object_name, flag_uid in zip(object_names, flag_uids) -%} diff --git a/python/lbmpy_walberla/walberla_lbm_package.py b/python/lbmpy_walberla/walberla_lbm_package.py index 80b37a4a90f717e79ea60890a802a363c45dde52..ea583181f6c4165863d68a703acd341c8c41d71e 100644 --- a/python/lbmpy_walberla/walberla_lbm_package.py +++ b/python/lbmpy_walberla/walberla_lbm_package.py @@ -20,7 +20,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str, target: Target = Target.CPU, data_type=None, pdfs_data_type=None, cpu_openmp=None, cpu_vectorize_info=None, - max_threads=None, + max_threads=None, set_pre_collision_pdfs=True, **kernel_parameters): if macroscopic_fields is None: @@ -48,6 +48,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str, target=target, data_type=data_type, cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, max_threads=max_threads, + set_pre_collision_pdfs=set_pre_collision_pdfs, **kernel_parameters) spatial_shape = None diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index b2413bcefe8c4a4468e4971c1b8901116519c57d..6d05bf8ffd51808821311bf09f0db263570983b9 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -378,21 +378,40 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st @jinja2_context_decorator -def generate_function_collection_call(ctx, kernel_info, parameters_to_ignore=(), cell_interval=None, ghost_layers=None): +def generate_function_collection_call(ctx, kernel, parameters_to_ignore=(), + cell_interval=None, ghost_layers=None, use_field_ids=False): + + """Generates the function call to a pystencils kernel. 
It can be understood as a lightweight version of + `generate_call`. Thus, it will only generate the parameters needed to call the kernel as a list of strings. + + Args: + ctx: code generation context + kernel: pystencils kernel + parameters_to_ignore: In some cases not all parameters need to be printed. This is especially the case when + fixed parameters exist that are hardcoded in the jinja template. + cell_interval: Defines the name (string) of a walberla CellInterval object in scope. + ghost_layers: Defines the name (string) of a variable to define the number of used ghost_layers. + use_field_ids: If set to true field names will be printed with the suffix `ID_`, to indicated that + a BlockDataID is passed. + """ + target = translate_target(ctx['target']) is_gpu = target == Target.GPU parameters = [] - for param in kernel_info.parameters: + for param in kernel.parameters: if param.is_field_pointer and param.field_name not in parameters_to_ignore: - parameters.append(param.field_name) + if use_field_ids: + parameters.append(f"{param.field_name}ID_") + else: + parameters.append(param.field_name) - for param in kernel_info.parameters: + for param in kernel.parameters: if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: parameters.append(param.symbol.name) # TODO due to backward compatibility with high level interface spec - for parameter in kernel_info.kernel_selection_tree.get_selection_parameter_list(): + for parameter in kernel.kernel_selection_tree.get_selection_parameter_list(): if parameter.name not in parameters_to_ignore: parameters.append(parameter.name) diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h index 75e3cd13abdacdcd06ea7ba784552b66daa21b63..704a72274d802f3afe388bf992ba802b19afcc9b 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.h +++ b/python/pystencils_walberla/templates/Boundary.tmpl.h @@ -19,6 +19,7 @@ #pragma once #include 
"core/DataTypes.h" +#include "core/logging/Logging.h" {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" diff --git a/src/blockforest/communication/NonUniformPackInfo.h b/src/blockforest/communication/NonUniformPackInfo.h index 73c3f760fbfb54b3af1be35fdd2d633e3495269e..cd9b20725d6816e4ba21c51b49dbf1f3d091e086 100644 --- a/src/blockforest/communication/NonUniformPackInfo.h +++ b/src/blockforest/communication/NonUniformPackInfo.h @@ -52,28 +52,28 @@ public: /** * Should return true if the amount of data that is packed for a given block in direction * "dir" is guaranteed to remain constant over time. False otherwise. - * If you are not sure what to return, return false! Returning false is always save. + * If you are not sure what to return, return false! Returning false is always safe. * Falsely return true will lead to errors! However, if the data can be guaranteed to remain * constant over time, returning true enables performance optimizations during the communication. */ virtual bool constantDataExchange() const = 0; /** - * Must return false if calling unpackData and/or communicateLocal is not thread-safe. + * Must return false if calling `unpackData*()` and/or `communicateLocal*()` methods is not thread-safe. * True otherwise. - * If you are not sure what to return, return false! Returning false is always save. - * Falsely return true will most likely lead to errors! However, if both unpackData AND - * communicateLocal are thread-safe, returning true can lead to performance improvements. + * If you are not sure what to return, return false! Returning false is always safe. + * Falsely return true will most likely lead to errors! However, if both `unpackData*()` AND + * `communicateLocal*()` are thread-safe, returning true can lead to performance improvements. */ virtual bool threadsafeReceiving() const = 0; - /// Must be thread-safe! Calls packDataImpl. + /// Must be thread-safe! Calls \ref packDataEqualLevelImpl. 
inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const; - /// If NOT thread-safe, threadsafeReceiving must return false! + /// If NOT thread-safe, \ref threadsafeReceiving must return false! virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) = 0; - /// If NOT thread-safe, threadsafeReceiving must return false! + /// If NOT thread-safe, \ref threadsafeReceiving must return false! virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir ) = 0; inline void packDataCoarseToFine ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const; diff --git a/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h b/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h index c9c6b895c47977aaadd7ae2d5f9640051a5a83e9..f0e5838b4f8879019d2ede96538db39dc6182860 100644 --- a/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h +++ b/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h @@ -32,7 +32,7 @@ namespace communication { //******************************************************************************************************************* /*! - * Adapter to use a UniformPackInfo in a NonUniformBufferedScheme. No communication between coarse <-> fine blocks + * Adapter to use a \ref communication::UniformPackInfo in a \ref NonUniformBufferedScheme. No communication between coarse <-> fine blocks * happens. */ //******************************************************************************************************************* @@ -51,25 +51,25 @@ public: /** * Should return true if the amount of data that is packed for a given block in direction * "dir" is guaranteed to remain constant over time. False otherwise. - * If you are not sure what to return, return false! Returning false is always save. 
+ * If you are not sure what to return, return false! Returning false is always safe. * Falsely return true will lead to errors! However, if the data can be guaranteed to remain * constant over time, returning true enables performance optimizations during the communication. */ virtual bool constantDataExchange() const { return uniformPackInfo_->constantDataExchange(); } /** - * Must return false if calling unpackData and/or communicateLocal is not thread-safe. + * Must return false if calling `unpackData*()` and/or `communicateLocal*()` methods is not thread-safe. * True otherwise. - * If you are not sure what to return, return false! Returning false is always save. - * Falsely return true will most likely lead to errors! However, if both unpackData AND - * communicateLocal are thread-safe, returning true can lead to performance improvements. + * If you are not sure what to return, return false! Returning false is always safe. + * Falsely return true will most likely lead to errors! However, if both `unpackData*()` AND + * `communicateLocal*()` are thread-safe, returning true can lead to performance improvements. */ virtual bool threadsafeReceiving() const { return uniformPackInfo_->threadsafeReceiving(); } - /// If NOT thread-safe, threadsafeReceiving must return false! + /// If NOT thread-safe, \ref threadsafeReceiving must return false! virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) { uniformPackInfo_->unpackData( receiver, dir, buffer ); } - /// If NOT thread-safe, threadsafeReceiving must return false! + /// If NOT thread-safe, \ref threadsafeReceiving must return false! 
virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir ) { uniformPackInfo_->communicateLocal( sender, receiver, dir ); } virtual void unpackDataCoarseToFine( Block * /*fineReceiver*/, const BlockID & /*coarseSender*/, stencil::Direction /*dir*/, mpi::RecvBuffer & /*buffer*/ ) { } diff --git a/src/communication/UniformPackInfo.h b/src/communication/UniformPackInfo.h index aa110f9bdf5c51b37a57572cfbc800b004ab37b6..168ce9685473619a897e0ebb85677a7f10f66cee 100644 --- a/src/communication/UniformPackInfo.h +++ b/src/communication/UniformPackInfo.h @@ -35,18 +35,21 @@ namespace communication { /** - * \brief UniformPackInfo encapsulates information on how to extract data from blocks, - * that should be communicated (see packData() ) to neighboring blocks - * and how to inject this data in a receiving block (see unpackData() ) + * \brief Data packing/unpacking for ghost layer based communication of a field. * - * Another special method exists for communication between two blocks, - * which are allocated on the same - * process. In this case the data does not have be communicated via a buffer, + * Encapsulate information on how to extract data from blocks that should be + * communicated to neighboring blocks (see \ref packData()) + * and how to inject this data in a receiving block (see \ref unpackData()). + * This involves a memory buffer and two memory copy operations. + * + * A special method exists for communication between two blocks which are + * allocated on the same process (see \ref communicateLocal()). + * In this case the data does not have be communicated via a buffer, * but can be copied directly. * * Data that is packed in direction "dir" at one block is unpacked in * direction "stencil::inverseDir[dir]" at the neighboring block. This - * behavior must be implemented in "communicateLocal"! + * behavior must be implemented in \ref communicateLocal()! 
* * \ingroup communication */ @@ -65,23 +68,25 @@ public: /** * Should return true if the amount of data that is packed for a given block in direction * "dir" is guaranteed to remain constant over time. False otherwise. - * If you are not sure what to return, return false! Returning false is always save. - * Falsely return true will lead to errors! However, if the data can be guaranteed to remain + * If you are not sure what to return, return false! Returning false is always safe. + * Falsely returning true will lead to errors! However, if the data can be guaranteed to remain * constant over time, returning true enables performance optimizations during the communication. */ virtual bool constantDataExchange() const = 0; /** - * Must return false if calling unpackData and/or communicateLocal is not thread-safe. + * Must return false if calling \ref unpackData and/or \ref communicateLocal is not thread-safe. * True otherwise. - * If you are not sure what to return, return false! Returning false is always save. - * Falsely return true will most likely lead to errors! However, if both unpackData AND - * communicateLocal are thread-safe, returning true can lead to performance improvements. + * If you are not sure what to return, return false! Returning false is always safe. + * Falsely returning true will most likely lead to errors! However, if both \ref unpackData AND + * \ref communicateLocal are thread-safe, returning true can lead to performance improvements. */ virtual bool threadsafeReceiving() const = 0; /** - * Packs data from a block into a send buffer. Must be thread-safe! Calls packDataImpl. + * \brief Pack data from a block into a send buffer. + * + * Must be thread-safe! Calls \ref packDataImpl. 
* * @param sender the block whose data should be packed into a buffer * @param dir pack data for neighbor in this direction @@ -91,19 +96,21 @@ public: inline void packData( const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const; /** - * Unpacks received Data. - * If NOT thread-safe, threadsafeReceiving must return false! + * \brief Unpack received Data. + * + * If NOT thread-safe, \ref threadsafeReceiving must return false! * * @param receiver the block where the unpacked data should be stored into * @param dir receive data from neighbor in this direction - * @param buffer + * @param buffer buffer for reading the data from */ virtual void unpackData( IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) = 0; /** - * Function to copy data from one local block to another local block. + * \brief Copy data from one local block to another local block. + * * Both blocks are allocated on the current process. - * If NOT thread-safe, threadsafeReceiving must return false! + * If NOT thread-safe, \ref threadsafeReceiving must return false! * * @param sender id of block where the data should be copied from * @param receiver id of block where the data should be copied to @@ -134,7 +141,9 @@ public: protected: /** - * Packs data from a block into a send buffer. Must be thread-safe! + * \brief Pack data from a block into a send buffer. + * + * Must be thread-safe! * * @param sender the block whose data should be packed into a buffer * @param dir pack data for neighbor in this direction diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h index 4e7c019a86dcf0ce23fb7c8a9e66e6add8500bde..d6147c12be61a072f112e49de6e68001a6013abc 100644 --- a/src/core/DataTypes.h +++ b/src/core/DataTypes.h @@ -167,6 +167,7 @@ using real_t = double; using real_t = float; #endif +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT /// Half precision support. Experimental. Use carefully. 
/// /// This feature is experimental, since it strictly depends on the underlying architecture and compiler support. @@ -174,7 +175,6 @@ using real_t = float; /// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16. /// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future. /// -#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT /// FIXME: (not really right) Clang version must be 15 or higher for x86 half precision support. /// FIXME: (not really right) GCC version must be 12 or higher for x86 half precision support. /// FIXME: (I don't know) Also support seems to require SSE, so ensure that respective instruction sets are enabled. @@ -202,7 +202,7 @@ using half = _Float16; // Another possible half precision format would be the one from Google Brain (bfloat16) with an 8 bit exponent and a 7 bit mantissa. // Compare https://i10git.cs.fau.de/ab04unyc/walberla/-/issues/23 using float16 = half; -#endif +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT using float32 = float; using float64 = double; @@ -276,7 +276,7 @@ inline bool floatIsEqual( walberla::float16 lhs, walberla::float16 rhs, const wa const auto difference = lhs - rhs; return ( (difference < 0) ? 
-difference : difference ) < epsilon; } -#endif +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT } // namespace walberla diff --git a/src/core/mpi/CMakeLists.txt b/src/core/mpi/CMakeLists.txt index 13cc3653bd41f03e1840abd09f71147eee3c57cd..2987e28c266bce6552659cce89f5ef9546927a76 100644 --- a/src/core/mpi/CMakeLists.txt +++ b/src/core/mpi/CMakeLists.txt @@ -19,11 +19,14 @@ target_sources( core MPIIO.h MPIManager.cpp MPIManager.h + MPIOperation.h MPITextFile.cpp MPITextFile.h + MPIWrapper.cpp MPIWrapper.h OpenMPBufferSystem.h OpenMPBufferSystem.impl.h + Operation.h RecvBuffer.h Reduce.h SendBuffer.h diff --git a/src/core/mpi/Datatype.h b/src/core/mpi/Datatype.h index f717cb6d94c661aec320a864c972dbba4a49d2ae..ac933bbe8c1e572520d0e67f75f8199c1adc8910 100644 --- a/src/core/mpi/Datatype.h +++ b/src/core/mpi/Datatype.h @@ -22,6 +22,7 @@ #pragma once #include "MPIWrapper.h" +#include "core/Abort.h" namespace walberla { @@ -43,6 +44,21 @@ namespace mpi { WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); } } + explicit Datatype(const uint_t byteSize) : mpiDatatype_(MPI_DATATYPE_NULL) + { + WALBERLA_MPI_SECTION() + { + if (MPI_Type_contiguous(int_c(byteSize), MPI_BYTE, &mpiDatatype_) != MPI_SUCCESS) + { + WALBERLA_ABORT("MPI_Type_contiguous " << typeid(mpiDatatype_).name() << " failed."); + } + if (MPI_Type_commit(&mpiDatatype_) != MPI_SUCCESS) + { + WALBERLA_ABORT("MPI_Type_commit " << typeid(mpiDatatype_).name() << " failed."); + } + } + } + void init( MPI_Datatype datatype ) { mpiDatatype_ = datatype; diff --git a/src/core/mpi/MPIManager.cpp b/src/core/mpi/MPIManager.cpp index a334bc16c4878cea58a1452cd083fba31d9d3c7e..c25ca1082277d89c8be486ae7a9c61350d6bfea0 100644 --- a/src/core/mpi/MPIManager.cpp +++ b/src/core/mpi/MPIManager.cpp @@ -119,6 +119,10 @@ void MPIManager::finalizeMPI() { WALBERLA_MPI_SECTION() { + /// Free the custom types and operators + customMPITypes_.clear(); + customMPIOperations_.clear(); + if (isMPIInitialized_ && !currentlyAborting_) { 
isMPIInitialized_ = false; diff --git a/src/core/mpi/MPIManager.h b/src/core/mpi/MPIManager.h index 9ba3fb4d04b8f6b0c7f1e2041454677b9156bf75..60ce4d8514e57bb2465a14dfa304cab73b3b8be6 100644 --- a/src/core/mpi/MPIManager.h +++ b/src/core/mpi/MPIManager.h @@ -18,23 +18,28 @@ //! \author Florian Schornbaum <florian.schornbaum@fau.de> //! \author Martin Bauer <martin.bauer@fau.de> //! \author Christian Godenschwager <christian.godenschwager@fau.de> +//! \author Michael Zikeli <michael.zikeli@fau.de> // //====================================================================================================================== #pragma once -#include "MPIWrapper.h" #include "core/DataTypes.h" #include "core/debug/Debug.h" #include "core/math/Uint.h" +#include "core/mpi/Datatype.h" +#include "core/mpi/MPIOperation.h" +#include "core/mpi/MPIWrapper.h" +#include "core/mpi/Operation.h" #include "core/singleton/Singleton.h" +#include <map> +#include <typeindex> namespace walberla { namespace mpi { - /** * Encapsulates MPI Rank/Communicator information * @@ -127,6 +132,87 @@ public: /// Indicates whether MPI-IO can be used with the current MPI communicator; certain versions of OpenMPI produce /// segmentation faults when using MPI-IO with a 3D Cartesian MPI communicator (see waLBerla issue #73) bool isCommMPIIOValid() const; + + /// Return the custom MPI_Datatype stored in 'customMPITypes_' and defined by the user and passed to 'commitCustomType'. + template< typename CType > + MPI_Datatype getCustomType() const + { + WALBERLA_MPI_SECTION() + { + return customMPITypes_.at(typeid(CType)).operator MPI_Datatype(); + } + WALBERLA_NON_MPI_SECTION() + { + WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." ); + } + } + + /// Return the custom MPI_Op stored in 'customMPIOperation_' and defined by the user and passed to 'commitCustomOperation'. 
+ template< typename CType > + MPI_Op getCustomOperation(mpi::Operation op) const + { + // FIXME the operation is actually type dependent but implementing this is not straightforward, + // compare comment at declaration of 'customMPIOperations_'. + WALBERLA_MPI_SECTION() + { + return customMPIOperations_.at(op).operator MPI_Op(); + } + WALBERLA_NON_MPI_SECTION() + { + WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." ); + } + } + //@} + //******************************************************************************************************************* + + //** Setter Functions ********************************************************************************************** + /*! \name Setter Function */ + //@{ + ///! \brief Initializes a custom MPI_Datatype and logs it in the customMPITypes_ map. + ///! \param argument The argument that is expected by the constructor of mpi::Datatype + /// At the point of creation 26.01.2024 this is either MPI_Datatype or const int. + template < typename CType, class ConstructorArgumentType > + void commitCustomType( ConstructorArgumentType& argument ) + { + WALBERLA_MPI_SECTION() + { + if (isMPIInitialized_ && !currentlyAborting_) + { + static_assert( std::is_same_v<ConstructorArgumentType, const int> || std::is_same_v<ConstructorArgumentType, MPI_Datatype>, + "mpi::Datatype has only an constructor for an int value or a MPI_Datatype." ); + [[maybe_unused]] auto worked = std::get< 1 >( customMPITypes_.try_emplace(typeid(CType), argument) ); + WALBERLA_ASSERT(worked, "Emplacement of type " << typeid(CType).name() << " did not work."); + } else { + WALBERLA_ABORT( "MPI must be initialized before an new MPI_Datatype can be committed." ); + } + } + WALBERLA_NON_MPI_SECTION() + { + WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." ); + } + } + + ///! \brief Initializes a custom MPI_Op and logs it in the customMPIOperation map + ///! \param op A operator, e.g. SUM, MIN. + ///! 
\param fct The definition of the MPI_User_function used for this operator. + template< typename CType > + void commitCustomOperation( mpi::Operation op, MPI_User_function* fct ) + { + WALBERLA_MPI_SECTION() + { + if (isMPIInitialized_ && !currentlyAborting_) + { + [[maybe_unused]] auto worked = std::get< 1 >(customMPIOperations_.try_emplace(op, fct)); + WALBERLA_ASSERT(worked, "Emplacement of operation " << typeid(op).name() << " of type " + << typeid(CType).name() << " did not work."); + } + else { WALBERLA_ABORT("MPI must be initialized before an new MPI_Op can be committed."); } + } + WALBERLA_NON_MPI_SECTION() + { + WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." ); + } + } //@} //******************************************************************************************************************* @@ -163,6 +249,33 @@ private: // Singleton MPIManager() : comm_(MPI_COMM_NULL) { WALBERLA_NON_MPI_SECTION() { rank_ = 0; } } +/// It is possible to commit own datatypes to MPI, that are not part of the standard. One example would be float16. +/// With these maps, it is possible to track self defined MPI_Datatypes and MPI_Ops, to access them at any time and +/// place in the program, also, they are automatically freed once MPIManager::finalizeMPI is called. +/// To initialize types or operations and add them to the map, the getter functions 'commitCustomType' and +/// 'commitCustomOperation' should be used. This can for example be done e.g. in the specialization of the MPITrait of +/// the newly defined type. For an example see MPIWrapper.cpp + std::map< std::type_index, walberla::mpi::Datatype > customMPITypes_{}; + std::map< walberla::mpi::Operation, walberla::mpi::MPIOperation > customMPIOperations_{}; + // FIXME this must be type specific as well, but doing so is a bit more complicated. + // 1. Idea) Combining both maps together e.g. 
as std::map< typeid(CType), + // std::pair< MPI_DataType, + // std::map< Operation, + // MPI_Op > > > customMPITypesWithOps_{}; + // There the access is quite nasty to look at, but easily doable, the construction however is quite difficult + // also one needs to make sure that the type is initialized before the operation. + // 2. Idea) Leaving it as two maps customMPITypes_ and customMPIOperations, + // but storing a pair of typeid and operation as key for the operation map. + // This way everything would look nice, but someone needs to implement a comparison operator for this pair. + // I personally don't know where to put this comparison operator to, since it should not be part of the manager. + // 3. Idea) Since this relies on the use of MPITrait<CType> --> { MPI_Datatype, MPI_Op } someone could define a object + // to store in the MPIManager there, to keep the MPIManager light and easily understandable. + // I'm also not sure if the MPITrait is the right spot for this though. + // For more information about the changes done in the code to allow custom defined types and operations, + // check out MR !647 ( https://i10git.cs.fau.de/walberla/walberla/-/merge_requests/647 ) + + + }; // class MPIManager diff --git a/src/core/mpi/MPIOperation.h b/src/core/mpi/MPIOperation.h new file mode 100644 index 0000000000000000000000000000000000000000..3da98bfe8ea2bdb8783b02e997cdee486bb46a0d --- /dev/null +++ b/src/core/mpi/MPIOperation.h @@ -0,0 +1,64 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file MPIOperation.h +//! \ingroup core +//! \author Michael Zikeli <michael.zikeli@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/Abort.h" +#include "core/mpi/MPIWrapper.h" + +namespace walberla::mpi{ + +//******************************************************************************************************************* +/*! RAII class for MPI operators that commits and frees them +* +*/ +//******************************************************************************************************************* +class MPIOperation +{ + public: + MPIOperation() = delete; + + explicit MPIOperation( MPI_User_function* fct ) : mpiOperation_( MPI_OP_NULL ) + { + WALBERLA_MPI_SECTION() { + if ( MPI_Op_create( fct, true, &mpiOperation_) != MPI_SUCCESS ) + { + WALBERLA_ABORT("MPI_Op_create for " << typeid(mpiOperation_).name() << " failed." 
); + } + } // WALBERLA_MPI_SECTIION + } + + ~MPIOperation() + { + WALBERLA_MPI_SECTION() { MPI_Op_free( & mpiOperation_ ); } + } + + operator MPI_Op() const + { + return mpiOperation_; + } + + protected: + MPI_Op mpiOperation_; +}; + + +} // namespace walberla::mpi \ No newline at end of file diff --git a/src/core/mpi/MPIWrapper.cpp b/src/core/mpi/MPIWrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d68668676c932de2c564dd37312ca3645104e45 --- /dev/null +++ b/src/core/mpi/MPIWrapper.cpp @@ -0,0 +1,142 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file MPIWrapper.cpp +//! \ingroup core +//! \author Michael Zikeli <michael.zikeli@fau.de> +// +//====================================================================================================================== + +#include "MPIWrapper.h" + +#include <set> + +#include "MPIManager.h" + +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +namespace walberla +{ + +namespace mpi +{ +namespace +{ +/// These functions than can be used by self defined mpi operations, e.g. by using CustomMPIOperation. 
+using float16_t = walberla::float16; +// The signature of MPI_User_function looks like this +// typedef void MPI_User_function( void *invec, void *inoutvec, int *len, MPI_Datatype *datatype); + +void sum(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*) +{ + // cast mpi type to target c++ type + auto* rhs = (float16_t*) mpiRHSArray; + auto* lhs = (float16_t*) mpiLHSArray; + for (int i = 0; i < *len; ++i) + { + *lhs += *rhs; + } +} + +void min(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*) +{ + // cast mpi type to target c++ type + auto* rhs = (float16_t*) mpiRHSArray; + auto* lhs = (float16_t*) mpiLHSArray; + for (int i = 0; i < *len; ++i) + { + *lhs = (*rhs >= *lhs) ? *lhs : *rhs; + } +} + +void max(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*) +{ + // cast mpi type to target c++ type + auto* rhs = (float16_t*) mpiRHSArray; + auto* lhs = (float16_t*) mpiLHSArray; + for (int i = 0; i < *len; ++i) + { + *lhs = (*rhs <= *lhs) ? *lhs : *rhs; + } +} + +MPI_User_function* returnMPIUserFctPointer(const Operation op) +{ + switch (op) + { + case SUM: + return ∑ + case MIN: + return &min; + case MAX: + return &max; + default: + WALBERLA_ABORT("The chosen operation " << typeid(op).name() << " is not implemented for float16 yet."); + } +} + +} +} + +/// Here some MPI_Datatypes and MPI_Ops are initialized that are not part of the MPI Standard and therefore have to be +/// define yourself. This is done in the MPIManager, since they need to be freed before MPIFinalize is called and this +/// way it is easiest to keep track of them. +/// For more information about this feature compare MR !647 ( +/// https://i10git.cs.fau.de/walberla/walberla/-/merge_requests/647 ) + +/*! + * \brief Specialization of MPITrait for float16 + * + * The initialization of the self defined MPI_Datatype and MPI_Op is done in the MPIManager so that it can be freed + * before MPI is finalized. 
+ */ +MPI_Datatype MPITrait< walberla::float16 >::type() +{ + +#ifdef WALBERLA_BUILD_WITH_MPI + // Since this type should be created only once, a static variable is used as safeguard. + static bool initializedType = false; + if (!initializedType) + { + // Since float16 consists of two Bytes, a continuous datatype with size of two byte is created. + mpi::MPIManager::instance()->commitCustomType< walberla::float16, const int >(2); + initializedType = true; + } + return mpi::MPIManager::instance()->getCustomType< walberla::float16 >(); +#else + return mpistubs::MPI_FLOAT16; +#endif +} + +MPI_Op MPITrait< walberla::float16 >::operation(const mpi::Operation& op) +{ + WALBERLA_MPI_SECTION() + { + // mpi::Operation is an enum and not an enum class, thus, it is not sufficient to make a just a bool variable as + // safeguard, since all operations are of type mpi::Operation and only the first one would pass the safeguard. + // Therefore, a set is created and each operation that is called the first time, will be initialized. 
+ static std::set< mpi::Operation > operationInitializationRegister; + const bool needsInitialization = std::get< 1 >(operationInitializationRegister.emplace(op)); + if (needsInitialization) + { + mpi::MPIManager::instance()->commitCustomOperation< walberla::float16 >( + op, mpi::returnMPIUserFctPointer(op)); + } + return MPIManager::instance()->getCustomOperation< walberla::float16 >(op); + } + WALBERLA_NON_MPI_SECTION() { WALBERLA_ABORT("If MPI is not used, a custom operator should never be called."); } +} + +} // namespace walberla +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h index eecee3136c702a61e7987d25705076a7c780a635..baf0c62d8ba65bd7bd91a665d5d5261e8323d778 100644 --- a/src/core/mpi/MPIWrapper.h +++ b/src/core/mpi/MPIWrapper.h @@ -23,8 +23,7 @@ #pragma once #include "core/Abort.h" - - +#include "core/mpi/Operation.h" /// \cond internal @@ -47,6 +46,7 @@ #endif + namespace walberla { namespace mpistubs { //empty namespace which can be used @@ -77,8 +77,6 @@ namespace mpistubs { #pragma warning ( pop ) #endif - - #else // WALBERLA_BUILD_WITH_MPI @@ -143,6 +141,10 @@ const MPI_Datatype MPI_UNSIGNED_LONG_LONG = 10; const MPI_Datatype MPI_FLOAT = 11; const MPI_Datatype MPI_DOUBLE = 12; const MPI_Datatype MPI_LONG_DOUBLE = 13; +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + const MPI_Datatype MPI_FLOAT16 = 14; +#endif + const int MPI_ORDER_C = 0; const int MPI_ORDER_FORTRAN = 0; @@ -151,16 +153,17 @@ const MPI_Datatype MPI_ANY_SOURCE = -2; const MPI_Datatype MPI_ANY_TAG = -1; const MPI_Datatype MPI_DATATYPE_NULL = 0; -const MPI_Op MPI_MIN = 100; -const MPI_Op MPI_MAX = 101; -const MPI_Op MPI_SUM = 102; -const MPI_Op MPI_PROD = 103; -const MPI_Op MPI_LAND = 104; -const MPI_Op MPI_BAND = 105; -const MPI_Op MPI_LOR = 106; -const MPI_Op MPI_BOR = 107; -const MPI_Op MPI_LXOR = 108; -const MPI_Op MPI_BXOR = 109; +const MPI_Op MPI_OP_NULL = 99; +const MPI_Op MPI_MIN = 100; +const MPI_Op 
MPI_MAX = 101; +const MPI_Op MPI_SUM = 102; +const MPI_Op MPI_PROD = 103; +const MPI_Op MPI_LAND = 104; +const MPI_Op MPI_BAND = 105; +const MPI_Op MPI_LOR = 106; +const MPI_Op MPI_BOR = 107; +const MPI_Op MPI_LXOR = 108; +const MPI_Op MPI_BXOR = 109; const int MPI_PACKED = 1; const int MPI_UNDEFINED = -1; @@ -265,6 +268,7 @@ inline int MPI_Type_get_extent(MPI_Datatype, MPI_Aint*, MPI_Aint*) { WALBERLA_MP inline int MPI_Type_create_struct(int, const int[], const MPI_Aint[], const MPI_Datatype[], MPI_Datatype*) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Op_create(MPI_User_function*, int, MPI_Op*) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Op_free(MPI_Op*) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Get_processor_name( char*, int* ) { WALBERLA_MPI_FUNCTION_ERROR } @@ -307,58 +311,104 @@ namespace mpi { + + /*!\class MPITrait // \brief Base template for the MPITrait class // // The MPITrait class template offers a translation between the C++ built-in data types and -// the corresponding MPI data types required for send and receive operations. For a particular -// MPITrait instantiation, the corresponding MPI data type can be obtained by calling type() -// of the MPITrait. The following example demonstrates the application of the MPITrait class: +// the corresponding MPI data types its respective operation required for send, receive and reduce operations. +// For a particular MPITrait instantiation, the corresponding MPI data type can be obtained by calling type() +// as well as calling operation( const Operation& ) to the MPI operation corresponding to the MPI data type. +// The following example demonstrates the application of the MPITrait class: - \code +\code // Initialization of the MPI communication int* pi; // Integer buffer for the MPI send operation - ... // Initialization of the send buffer +... 
// Initialization of the send buffer // Sending 50 integers to process 0 MPI_Send( pi, 50, MPITrait< int >::type(), 0, 0, MPI_COMM_WORLD ); - \endcode -*/ +\endcode + */ template< typename T > -struct MPITrait; - - - -/// Macro for the creation of MPITrait specializations for the supported data types. +struct MPITrait +{ + static inline MPI_Datatype type(); + static inline MPI_Op operation(const mpi::Operation& ); +}; -#define WALBERLA_CREATE_MPITRAIT_SPECIALIZATION(CPP_TYPE,MPI_TYPE) \ +/// Macro for specialization of the MPI supported data types in MPITrait::type(). +#define WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(CPP_TYPE, MPI_TYPE) \ template<> \ - struct MPITrait< CPP_TYPE > \ + inline MPI_Datatype MPITrait< CPP_TYPE >::type() \ { \ - static inline MPI_Datatype type() { return (MPI_TYPE); } \ + return (MPI_TYPE); \ } - - // MPITRAIT SPECIALIZATIONS -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( char , MPI_CHAR ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed char , MPI_CHAR ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed short int , MPI_SHORT ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed int , MPI_INT ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed long int , MPI_LONG ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed long long , MPI_LONG_LONG ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned char , MPI_UNSIGNED_CHAR ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned short int , MPI_UNSIGNED_SHORT ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned int , MPI_UNSIGNED ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long int , MPI_UNSIGNED_LONG ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long long , MPI_UNSIGNED_LONG_LONG ); +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(char, MPI_CHAR) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed char, MPI_CHAR) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed short int, MPI_SHORT) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed int, MPI_INT) 
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed long int, MPI_LONG) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed long long, MPI_LONG_LONG) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned char, MPI_UNSIGNED_CHAR) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned short int, MPI_UNSIGNED_SHORT) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned int, MPI_UNSIGNED) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned long int, MPI_UNSIGNED_LONG) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned long long, MPI_UNSIGNED_LONG_LONG) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(float, MPI_FLOAT) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(double, MPI_DOUBLE) +WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(long double, MPI_LONG_DOUBLE) #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT - WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( walberla::float16 , MPI_WCHAR ); +template<> struct MPITrait< float16 >{ + static MPI_Datatype type(); + static MPI_Op operation(const mpi::Operation& ); +}; +#endif + + +/*! + * \brief Specialization of the static operation() method of MPITrait. + * + * It chooses a MPI_Op depending on the value type of the object the operation is performed on. + * + * \param op The operation to be performed (op is an element of the enum mpi::Operation). 
+ */ +template< typename T > +MPI_Op MPITrait< T >::operation(const mpi::Operation& op) +{ + switch (op) + { + case mpi::MIN: + return MPI_MIN; + case mpi::MAX: + return MPI_MAX; + case mpi::SUM: + return MPI_SUM; + case mpi::PRODUCT: + return MPI_PROD; + case mpi::LOGICAL_AND: + return MPI_LAND; + case mpi::BITWISE_AND: + return MPI_BAND; + case mpi::LOGICAL_OR: + return MPI_LOR; + case mpi::BITWISE_OR: + return MPI_BOR; + case mpi::LOGICAL_XOR: + return MPI_LXOR; + case mpi::BITWISE_XOR: + return MPI_BXOR; + default: + WALBERLA_ABORT("Unknown operation!"); + } +#ifdef __IBMCPP__ + return MPI_SUM; // never reached, helps to suppress a warning from the IBM compiler #endif -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( float , MPI_FLOAT ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( double , MPI_DOUBLE ); -WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( long double , MPI_LONG_DOUBLE ); +} } // namespace walberla /// \endcond diff --git a/src/core/mpi/Operation.h b/src/core/mpi/Operation.h new file mode 100644 index 0000000000000000000000000000000000000000..f3097bb89fe4f3fcb3f4581d8435afe2b2b70ec8 --- /dev/null +++ b/src/core/mpi/Operation.h @@ -0,0 +1,27 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\file Operation.h +//! \ingroup core +//! \author Michael Zikeli <michael.zikeli@fau.de> +// +//====================================================================================================================== +#pragma once + +namespace walberla::mpi +{ +// Note: I don't like at all that this is an enum and not an enum class, but changing this would be a major change in the framework. +enum Operation { MIN, MAX, SUM, PRODUCT, LOGICAL_AND, BITWISE_AND, LOGICAL_OR, BITWISE_OR, LOGICAL_XOR, BITWISE_XOR }; +} // namespace walberla::mpi \ No newline at end of file diff --git a/src/core/mpi/Reduce.h b/src/core/mpi/Reduce.h index 5e9bb8220112ff4bff9c19a02e66cb3c2d801d46..a0b6edb39cad16fdfa5c527f83f9fe007ba9fa2c 100644 --- a/src/core/mpi/Reduce.h +++ b/src/core/mpi/Reduce.h @@ -16,18 +16,19 @@ //! \file Reduce.h //! \ingroup core //! \author Christian Godenschwager <christian.godenschwager@fau.de> +//! \author Michael Zikeli <michael.zikeli@fau.de> // //====================================================================================================================== #pragma once -#include "BufferDataTypeExtensions.h" - #include "core/Abort.h" #include "core/DataTypes.h" #include "core/debug/Debug.h" -#include "core/mpi/MPIManager.h" #include "core/mpi/MPIWrapper.h" +#include "core/mpi/Operation.h" + +#include "BufferDataTypeExtensions.h" #include "core/math/Vector3.h" @@ -36,33 +37,10 @@ namespace walberla { -namespace mpi { - -enum Operation { MIN, MAX, SUM, PRODUCT, LOGICAL_AND, BITWISE_AND, LOGICAL_OR, BITWISE_OR, LOGICAL_XOR, BITWISE_XOR }; - -inline MPI_Op toMPI_Op( Operation operation ) +namespace mpi { - switch( operation ) - { - case MIN: return MPI_MIN; - case MAX: return MPI_MAX; - case SUM: return MPI_SUM; - case PRODUCT: return MPI_PROD; - case LOGICAL_AND: return MPI_LAND; - case BITWISE_AND: return MPI_BAND; - case LOGICAL_OR: return MPI_LOR; - case BITWISE_OR: return MPI_BOR; - case LOGICAL_XOR: return MPI_LXOR; - case BITWISE_XOR: return MPI_BXOR; 
- default: WALBERLA_ABORT( "Unknown operation!" ); - } -#ifdef __IBMCPP__ - return MPI_SUM; // never reached, helps to suppress a warning from the IBM compiler -#endif -} - //====================================================================================================================== /*! * \brief Reduces a value over all processes in-place @@ -91,11 +69,11 @@ void reduceInplace( T & value, Operation operation, int recvRank = 0, MPI_Comm c if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( &value, nullptr, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( &value, nullptr, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } } @@ -128,11 +106,11 @@ inline void reduceInplace( bool & value, Operation operation, int recvRank = 0, if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } value = intValue != 0; @@ -172,11 +150,11 @@ T reduce( const T value, Operation operation, int recvRank = 0, MPI_Comm comm = if( myRank == recvRank ) { - MPI_Reduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( const_cast<T*>( &value ), nullptr, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + 
MPI_Reduce( const_cast<T*>( &value ), nullptr, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } return result; @@ -213,11 +191,11 @@ inline bool reduce( const bool value, Operation operation, int recvRank = 0, MPI if( myRank == recvRank ) { - MPI_Reduce( &intValue, &intResult, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( &intValue, &intResult, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } return intResult != 0; @@ -252,11 +230,11 @@ void reduceInplace( std::vector<T> & values, Operation operation, int recvRank = if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( values.empty() ? nullptr : &values[0], nullptr, int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( values.empty() ? nullptr : &values[0], nullptr, int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } } @@ -292,14 +270,14 @@ inline void reduceInplace( std::vector<bool> & values, Operation operation, int if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, sendBuffer.empty() ? 
nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), recvRank, comm ); size_t size = values.size(); convert( sendBuffer, values ); values.resize(size); } else { - MPI_Reduce( sendBuffer.empty() ? nullptr : &sendBuffer[0], nullptr, int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( sendBuffer.empty() ? nullptr : &sendBuffer[0], nullptr, int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), recvRank, comm ); } } @@ -331,11 +309,11 @@ void reduceInplace( math::Vector3<T> & values, Operation operation, int recvRank if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( values.data(), nullptr, 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( values.data(), nullptr, 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } } @@ -367,11 +345,11 @@ inline void reduceInplace( math::Vector3<bool> & values, Operation operation, in if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } for(uint_t i = 0; i < 3; ++i) @@ -411,11 +389,11 @@ math::Vector3<T> reduce( const math::Vector3<T> & values, Operation operation, i if( myRank == recvRank ) { - MPI_Reduce( const_cast<T*>( values.data() ), result.data(), 3, 
MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( const_cast<T*>( values.data() ), result.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } else { - MPI_Reduce( const_cast<T*>( values.data() ), nullptr, 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( const_cast<T*>( values.data() ), nullptr, 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm ); } return result; @@ -452,14 +430,14 @@ inline math::Vector3<bool> reduce( const math::Vector3<bool> & values, Operation if( myRank == recvRank ) { - MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); for(uint_t i = 0; i < 3; ++i) results[i] = intValues[i] != 0; } else { - MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm ); + MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm ); } return results; @@ -487,7 +465,7 @@ T allReduce( const T & value, Operation operation, MPI_Comm comm = MPI_COMM_WORL WALBERLA_NON_MPI_SECTION() { return value; } T result; - MPI_Allreduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm ); return result; } @@ -514,7 +492,7 @@ inline bool allReduce( const bool value, Operation operation, MPI_Comm comm = MP int intValue = value ? 
1 : 0; - MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm ); return intValue != 0; } @@ -539,7 +517,7 @@ void allReduceInplace( T & value, Operation operation, MPI_Comm comm = MPI_COMM_ WALBERLA_NON_MPI_SECTION() { return; } - MPI_Allreduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm ); } @@ -562,7 +540,7 @@ inline void allReduceInplace( bool & value, Operation operation, MPI_Comm comm = int intValue = value ? 1 : 0; - MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm ); value = intValue != 0; } @@ -587,7 +565,7 @@ void allReduceInplace( std::vector<T> & values, Operation operation, MPI_Comm co WALBERLA_NON_MPI_SECTION() { return; } - MPI_Allreduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), comm ); } @@ -612,7 +590,7 @@ inline void allReduceInplace( std::vector<bool> & bools, Operation operation, MP std::vector<uint8_t> sendBuffer; convert( bools, sendBuffer ); - MPI_Allreduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, sendBuffer.empty() ? 
nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), comm ); auto size = bools.size(); convert(sendBuffer, bools); bools.resize(size); @@ -637,7 +615,7 @@ void allReduceInplace( math::Vector3<T> & values, Operation operation, MPI_Comm WALBERLA_NON_MPI_SECTION() { return; } - MPI_Allreduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm ); } @@ -663,7 +641,7 @@ inline void allReduceInplace( math::Vector3<bool> & bools, Operation operation, math::Vector3<int> intValues{bools[0] ? 1 : 0, bools[1] ? 1 : 0, bools[2] ? 1 : 0}; - MPI_Allreduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), comm ); + MPI_Allreduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm ); for(uint_t i = 0; i < 3; ++i) { diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h index 9f7c3f97d1066ff7ffa322cb5d6550c9f5d5013b..efc7016409a6e91d8c83b96aebfbd9fc40c4a8cf 100644 --- a/src/core/timing/Timer.h +++ b/src/core/timing/Timer.h @@ -30,6 +30,7 @@ #include "WcPolicy.h" #include "core/DataTypes.h" +#include "core/mpi/MPIManager.h" #include "core/mpi/RecvBuffer.h" #include "core/mpi/Reduce.h" #include "core/mpi/SendBuffer.h" @@ -527,7 +528,7 @@ shared_ptr<Timer<TP> > getReduced( Timer<TP>& timer, ReduceType rt, int targetRa } //uint_t counter, double min, double max, double total, double sumOfSquares - if ( targetRank < 0 || targetRank == MPIManager::instance()->worldRank() ) + if ( targetRank < 0 || targetRank == mpi::MPIManager::instance()->worldRank() ) return make_shared<Timer<TP> >( mpi::MPIManager::instance()->numProcesses(), min, max, total, sumOfSquares ); return nullptr; diff --git a/src/domain_decomposition/BlockStorage.h b/src/domain_decomposition/BlockStorage.h index 
b59f7b30fe803a2c9abbe48c8856c9b4bbf2be20..2f29acdb7fd45c1e287ebf6d4601b641f303506b 100644 --- a/src/domain_decomposition/BlockStorage.h +++ b/src/domain_decomposition/BlockStorage.h @@ -723,6 +723,12 @@ inline void BlockStorage::clearBlockData( const BlockDataID & id ) { for( auto block = begin(); block != end(); ++block ) block->deleteData( id ); + + //also delete block data from data handling vector + auto elementToErase = std::remove_if(blockDataItem_.begin(), blockDataItem_.end(), + [id](const internal::BlockDataItem& dataItem) + { return dataItem.getId() == id; }); + blockDataItem_.erase(elementToErase, blockDataItem_.end()); } diff --git a/src/field/Field.h b/src/field/Field.h index 9a8e33c3ae9ba9aabec6c0f67f6503dbf6990c3b..30fe15586846f09c540f12cf309c36eb05135e05 100644 --- a/src/field/Field.h +++ b/src/field/Field.h @@ -262,7 +262,8 @@ namespace field { cell_idx_t yOff() const { return yOff_; } cell_idx_t zOff() const { return zOff_; } - bool coordinatesValid( cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f ) const; + bool coordinatesValid( cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f = 0 ) const; + bool coordinatesValid( const Cell & c, cell_idx_t f = 0 ) const { return coordinatesValid(c[0], c[1], c[2], f); }; //@} //**************************************************************************************************************** diff --git a/src/field/distributors/KernelDistributor.h b/src/field/distributors/KernelDistributor.h index 712d9a7b7e32ae96b0d259873d102166675b22b3..bc0abb8fa5882e51960d936132cd3613d55f5172 100644 --- a/src/field/distributors/KernelDistributor.h +++ b/src/field/distributors/KernelDistributor.h @@ -62,7 +62,7 @@ public: WALBERLA_ASSERT(baseField.nrOfGhostLayers() > uint_t(0), "field for kernel distribution needs at least one ghost layer"); } - inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; } + inline bool operator==( const OwnType & other ) const { return baseField_ == 
other.baseField_; } template< typename ForwardIterator_T > inline void distribute( const Vector3<real_t> & position, ForwardIterator_T distributeValueBegin ) diff --git a/src/field/distributors/NearestNeighborDistributor.h b/src/field/distributors/NearestNeighborDistributor.h index 932f443b3e30af9ab6a2bc4f6c6f3f585e485473..c4819cb9119d46e46dbc34bed40228664bb5cffb 100644 --- a/src/field/distributors/NearestNeighborDistributor.h +++ b/src/field/distributors/NearestNeighborDistributor.h @@ -59,7 +59,7 @@ public: : blockStorage_( blockStorage ), block_( block ), baseField_( baseField ), flagField_( flagField ), evaluationMask_( evaluationMask ) {} - inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; } + inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; } template< typename ForwardIterator_T > inline void distribute( const Vector3<real_t> & position, ForwardIterator_T distributeValueBegin ) diff --git a/src/field/interpolators/KernelFieldInterpolator.h b/src/field/interpolators/KernelFieldInterpolator.h index 0e59fabf21dfd4899e62ca29961811a0e99e59d9..0f5987e76e844df9505869fbaab3feadacbc88fa 100644 --- a/src/field/interpolators/KernelFieldInterpolator.h +++ b/src/field/interpolators/KernelFieldInterpolator.h @@ -105,7 +105,7 @@ public: } - inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; } + inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; } template< typename ForwardIterator_T > inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin ) diff --git a/src/field/interpolators/NearestNeighborFieldInterpolator.h b/src/field/interpolators/NearestNeighborFieldInterpolator.h index b5b5cba7f65a3e06517356933d157108ba81e90c..bb08276f9f12949a10461e96b1a815acd64f2559 100644 --- a/src/field/interpolators/NearestNeighborFieldInterpolator.h +++ 
b/src/field/interpolators/NearestNeighborFieldInterpolator.h @@ -57,7 +57,7 @@ public: {} - inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; } + inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; } template< typename ForwardIterator_T > inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin ) diff --git a/src/field/interpolators/TrilinearFieldInterpolator.h b/src/field/interpolators/TrilinearFieldInterpolator.h index e9809d835f1bf67e89da8a0eb2b5a9493624c84e..351ed7afea09db1c75feeb59dd771afb0796949e 100644 --- a/src/field/interpolators/TrilinearFieldInterpolator.h +++ b/src/field/interpolators/TrilinearFieldInterpolator.h @@ -62,7 +62,7 @@ public: } - inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; } + inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; } template< typename ForwardIterator_T > inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin ) diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h index cd50cc58d6e1c6ef708a1cc50e7fbcc897933281..d737983d1aa5f289d624cd508eea8a7969a5fe69 100644 --- a/src/gpu/FieldAccessor.h +++ b/src/gpu/FieldAccessor.h @@ -31,6 +31,13 @@ namespace gpu + /** + * \brief Handle to the underlying device data of a \ref GPUField. + * + * Encapsulate the device memory pointer and offsets necessary + * to calculate the address of a cell from a GPU kernel's thread + * coordinates in the thread block. 
+ */ template<typename T> class FieldAccessor { @@ -78,7 +85,7 @@ namespace gpu __device__ __forceinline__ bool isValidPosition() { return true; } __device__ T & get() { return * (T*)(ptr_); } - __device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); } + __device__ T & get( uint_t f) { return * (T*)(ptr_ + f * fOffset_); } __device__ T & getNeighbor( int cx, int cy, int cz ) const @@ -88,7 +95,7 @@ namespace gpu cz * zOffset_ ); } - __device__ T & getNeighbor( int cx, int cy, int cz, int cf ) + __device__ T & getNeighbor( int cx, int cy, int cz, uint_t cf ) { return * (T*)( ptr_ + cx * xOffset_ + cy * yOffset_ + diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h index 51b337e61237690ddc5163113abeb47ee44691b1..a06c95087898b3c705548311a7d9810e63519e78 100644 --- a/src/gpu/FieldIndexing.h +++ b/src/gpu/FieldIndexing.h @@ -44,6 +44,14 @@ namespace gpu template< typename T > class GPUField; +/** + * \brief Utility class to generate handles to the underlying device data of a \ref GPUField. + * + * Pre-calculate memory offsets of a \ref GPUField for a given slice, + * cell interval, or the entire grid with or without the ghost layer, + * and store them in a \ref FieldAccessor handle. + * That handle is obtained by calling \ref gpuAccess(). + */ template< typename T > class FieldIndexing { diff --git a/src/gpu/GPUField.h b/src/gpu/GPUField.h index f8a0242ed3aa5e9de3606d8ff1737b4fe869f42f..7d004c76203060c5fb77c350f306007a091ca0c9 100755 --- a/src/gpu/GPUField.h +++ b/src/gpu/GPUField.h @@ -45,16 +45,20 @@ namespace gpu * Basically a wrapper around a CUDA/HIP device pointer together with size information about the field * i.e. sizes in x,y,z,f directions and number of ghost layers. * - * Internally represented by a gpuPitchedPtr which is allocated with gpuMalloc3D to take padding of the - * innermost coordinate into account. + * Internally represented by a \c gpuPitchedPtr which is allocated with extra padding for the + * innermost coordinate. 
+ * Pitched memory is a type of non-linear memory where padding is introduced + * to optimize data alignment and thus reduce data access latency, + * for example by avoiding shared memory bank conflicts. * * Supports Array-of-Structures (AoS,zyxf) layout and Structure-of-Arrays (SoA, fzyx) layout, in a similar way - * to field::Field + * to \ref field::Field * - * To work with the GPUField look at the gpu::fieldCpy functions to transfer a field::Field to a gpu::GPUField + * To work with the \ref gpu::GPUField look at the \ref gpu::fieldCpy functions to transfer a \ref field::Field to a \ref gpu::GPUField * and vice versa. - * When writing device kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. - * These simplify the "iteration" i.e. indexing of cells in GPUFields. + * + * When writing device kernels for a \ref GPUField, have a look at the \ref FieldIndexing and \ref FieldAccessor concepts. + * These simplify the "iteration" i.e. indexing of cells in a \ref GPUField. */ //******************************************************************************************************************* template<typename T> diff --git a/src/gpu/GPURAII.h b/src/gpu/GPURAII.h index 815b3829114506a8c601669aa4195461bd60151a..6bcfd7811b4e68f591f3dfe2597111139ec9906f 100644 --- a/src/gpu/GPURAII.h +++ b/src/gpu/GPURAII.h @@ -13,7 +13,7 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file CudaRAII.h +//! \file GPURAII.h //! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // diff --git a/src/gpu/communication/GPUPackInfo.h b/src/gpu/communication/GPUPackInfo.h index c34600f29b2219088c29b0d5ff2e9fb1dc4a1142..f700c372924a310e3ba816164692b1b6650c13d5 100644 --- a/src/gpu/communication/GPUPackInfo.h +++ b/src/gpu/communication/GPUPackInfo.h @@ -47,10 +47,28 @@ namespace walberla::gpu::communication { /** - * Data packing/unpacking for ghost layer based communication of a gpu::GPUField + * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField. + * + * Encapsulate information on how to extract data from blocks that should be + * communicated to neighboring blocks (see \ref packDataImpl()) + * and how to inject this data in a receiving block (see \ref unpackData()). + * This involves a host memory buffer and two device-to-host memory copy operations. + * + * A special method exists for communication between two blocks which are + * allocated on the same process (see \ref communicateLocal()). + * In this case the data does not have be communicated via a host buffer, + * but can be sent directly. This involves a single device-to-device memory + * copy operation. + * + * Data that is packed in direction "dir" at one block is unpacked in + * direction "stencil::inverseDir[dir]" at the neighboring block. + * This behavior must be implemented in \ref communicateLocal()! + * + * See \ref MemcpyPackInfo for a more efficient packing/unpacking method + * where the buffer is stored in device memory rather than in host memory. + * * \ingroup gpu - * Template Parameters: - * - GPUField_T A fully qualified GPUField. + * \tparam GPUField_T A fully qualified \ref GPUField. 
*/ template<typename GPUField_T> class GPUPackInfo : public walberla::communication::UniformPackInfo diff --git a/src/gpu/communication/GeneratedGPUPackInfo.h b/src/gpu/communication/GeneratedGPUPackInfo.h index f5f6c98b60b529045a1877a435fcacacb9359a95..4b905ad63fa8aca23006d8b5ac2a2a09ab30078a 100644 --- a/src/gpu/communication/GeneratedGPUPackInfo.h +++ b/src/gpu/communication/GeneratedGPUPackInfo.h @@ -28,14 +28,59 @@ namespace walberla::gpu { +/** + * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField. + * + * Encapsulate information on how to extract data from blocks that should be + * communicated to neighboring blocks (see \ref pack()) + * and how to inject this data in a receiving block (see \ref unpack()). + * This involves a memory buffer and two memory copy operations. + * + * A special method exists for communication between two blocks which are + * allocated on the same process (see \ref communicateLocal()). + * In this case the data does not have be communicated via a buffer, + * but can be copied directly. + * + * Data that is packed in direction "dir" at one block is unpacked in + * direction "stencil::inverseDir[dir]" at the neighboring block. This + * behavior must be implemented in \ref communicateLocal()! + * + * \ingroup gpu + */ class GeneratedGPUPackInfo { public: GeneratedGPUPackInfo() = default; virtual ~GeneratedGPUPackInfo() = default; + /** + * \brief Pack data from a block into a send buffer. + * + * \param dir pack data for neighbor in this direction + * \param buffer buffer for writing the data into + * \param block the block whose data should be packed into a buffer + * \param stream GPU stream + */ virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; + /** + * \brief Copy data from one local block to another local block. + * + * Both blocks are allocated on the same MPI rank. 
+ * + * \param dir the direction of the communication (from sender to receiver) + * \param sender id of block where the data should be copied from + * \param receiver id of block where the data should be copied to + * \param stream GPU stream + */ virtual void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) = 0; + /** + * \brief Unpack data from a receive buffer into a block. + * + * \param dir receive data from neighbor in this direction + * \param buffer buffer for reading the data from + * \param block the block where the unpacked data should be stored into + * \param stream GPU stream + */ virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0; }; diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h index 6c15988f4f2687275fea7f0f8be36b2e7d99fcf6..c55c3394193afb67244e93843cf9d11d12fca7dd 100644 --- a/src/gpu/communication/MemcpyPackInfo.h +++ b/src/gpu/communication/MemcpyPackInfo.h @@ -13,6 +13,27 @@ namespace walberla::gpu::communication { +/** + * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField. + * + * Encapsulate information on how to extract data from blocks that should be + * communicated to neighboring blocks (see \ref pack()) + * and how to inject this data in a receiving block (see \ref unpack()). + * This involves a device memory buffer and two device-to-device memory copy operations. + * + * A special method exists for communication between two blocks which are + * allocated on the same process (see \ref communicateLocal()). + * In this case the data does not have be communicated via a device buffer, + * but can be sent directly. This involves a single device-to-device memory + * copy operation. 
+ * + * Data that is packed in direction "dir" at one block is unpacked in + * direction "stencil::inverseDir[dir]" at the neighboring block. + * This behavior must be implemented in \ref communicateLocal()! + * + * \ingroup gpu + * \tparam GPUFieldType A fully qualified \ref GPUField. + */ template<typename GPUFieldType> class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo { diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h index 745d28cc5f18e0df1ce6eeeda0cfbf5d478656ee..b872be1d0c80e3537971b49434d64033373a1822 100644 --- a/src/gpu/communication/NonUniformGPUScheme.h +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -46,7 +46,7 @@ namespace walberla::gpu::communication template< typename Stencil > class NonUniformGPUScheme { -public: + public: enum INDEX { EQUAL_LEVEL = 0, COARSE_TO_FINE = 1, FINE_TO_COARSE = 2 }; using CpuBuffer_T = walberla::gpu::communication::PinnedMemoryBuffer; @@ -90,7 +90,7 @@ public: inline void waitCommunicateCoarseToFine(uint_t fineLevel); inline void waitCommunicateFineToCoarse(uint_t fineLevel); -private: + private: void setupCommunication(); void init(); @@ -133,17 +133,21 @@ private: template< typename Stencil > NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU, const int tag) - : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), - requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) + : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), + requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) { WALBERLA_MPI_SECTION() - { + { // Open MPI supports compile time CUDA-aware support check #if (defined(OPEN_MPI) && OPEN_MPI) && !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) - WALBERLA_CHECK(!sendDirectlyFromGPU) + WALBERLA_CHECK(!sendDirectlyFromGPU) 
#endif - } + } init(); + + if(sendFromGPU_){WALBERLA_LOG_DETAIL_ON_ROOT("Using GPU-Direct Communication in NonUniformGPUScheme")} + else{WALBERLA_LOG_DETAIL_ON_ROOT("Using Communication via CPU Memory")} + } template< typename Stencil > @@ -151,16 +155,18 @@ NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBl const Set< SUID >& requiredBlockSelectors, const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU, const int tag) - : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), - incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag) + : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), + incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag) { WALBERLA_MPI_SECTION() - { + { #if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) - WALBERLA_CHECK(!sendDirectlyFromGPU) + WALBERLA_CHECK(!sendDirectlyFromGPU) #endif - } + } init(); + if(sendFromGPU_){WALBERLA_LOG_DETAIL_ON_ROOT("Using GPU-Direct Communication in NonUniformGPUScheme")} + else{WALBERLA_LOG_DETAIL_ON_ROOT("Using Communication via CPU Memory")} } template< typename Stencil > @@ -212,7 +218,7 @@ void NonUniformGPUScheme< Stencil >::refresh() #ifndef NDEBUG for (auto & packInfo : packInfos_) - packInfo->clearBufferSizeCheckMap(); + packInfo->clearBufferSizeCheckMap(); #endif forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); } @@ -307,9 +313,6 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i for (auto it : headers_[EQUAL_LEVEL][index]) bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); - // wait until communication dependent kernels are finished - WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) - // Start filling send buffers for (auto& iBlock : *forest) { @@ -396,10 +399,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const 
uint_t else bufferSystemCPU_[COARSE_TO_FINE][index].scheduleReceives(); - if (!sendFromGPU_) - for (auto it : headers_[COARSE_TO_FINE][index]) - bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); - + for (auto it : headers_[COARSE_TO_FINE][index]){ + bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); + } // wait until communication dependent kernels are finished WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) @@ -444,24 +446,24 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t { auto nProcess = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n)); GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess); - gpuDataBuffer.clear(); for (auto& pi : packInfos_) { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) if (sendFromGPU_) { - pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[0]); } else { + gpuDataBuffer.clear(); auto gpuDataPtr = gpuDataBuffer.cur(); // packDataCoarseToFine moves the pointer with advanceNoResize - pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[0]); auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir); auto cpuDataPtr = bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size); WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir])) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[0])) } } } @@ -502,9 +504,8 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const 
uint_t else bufferSystemCPU_[FINE_TO_COARSE][index].scheduleReceives(); - if (!sendFromGPU_) - for (auto it : headers_[FINE_TO_COARSE][index]) - bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); + for (auto it : headers_[FINE_TO_COARSE][index]) + bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); // wait until communication dependent kernels are finished WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) @@ -548,24 +549,24 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t { auto nProcess = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0))); GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess); - gpuDataBuffer.clear(); for (auto& pi : packInfos_) { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeFineToCoarseSend(fineBlock, *dir)) if (sendFromGPU_) { - pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[0]); } else { + gpuDataBuffer.clear(); auto gpuDataPtr = gpuDataBuffer.cur(); // packDataFineToCoarse moves the pointer with advanceNoResize - pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[0]); auto size = pi->sizeFineToCoarseSend(fineBlock, *dir); auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size); WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir])) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[0])) } } } @@ -672,7 +673,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi GpuBuffer_T &gpuDataBuffer = 
recvInfo.buffer(); WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], - gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]); + gpuDataBuffer, streams_[0]); } } } @@ -696,8 +697,8 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]])) - pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]); + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[0])) + pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[0]); } } } @@ -735,7 +736,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi { GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]); + pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[0]); } } } @@ -759,8 +760,8 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]])) - pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]); + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[0])) + 
pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[0]); } } } diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h index bc481d8950c25d4aa5196316c641e8b67e34318a..183df0497a53e11f2260fc3e591d65462800c036 100644 --- a/src/gpu/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -42,6 +42,37 @@ namespace communication { +/** + * \brief Communication scheme for buffered communication in uniform block grids. + * + * Synchronize a set of \ref GPUField between GPU devices. + * Communication between fields on the same process: use direct copy + * via \ref GeneratedGPUPackInfo::communicateLocal. + * Communication between different processes: use a buffered communication scheme; + * when multiple fields have been changed they can be synchronized at once, + * using one MPI message per communication partner. + * + * \code + * UniformGPUScheme<stencil::D3Q19> scheme; // the stencil defines the communication neighbors + * scheme.addPackInfo( make_shared<gpu::communication::MemcpyPackInfo<FieldType> >( idOfFirstField ) ); + * scheme.addPackInfo( make_shared<gpu::communication::MemcpyPackInfo<FieldType> >( idOfSecondField ) ); + * + * // either synchronous communication... + * scheme(); + * + * // .. or asynchronous: + * scheme.startCommunication(); + * functionWhichDoesNotNeedCommunicatedValues(); + * scheme.wait(); + * \endcode + * + * This scheme sends one message per communication step and neighbor device. + * Therefore all contents that have to be sent are packed into a single buffer. + * Multiple \ref GeneratedGPUPackInfo can be registered to send their contents in a single step. + * + * When running multiple \ref UniformGPUScheme concurrently, different MPI tags + * have to be used for the schemes: the tag can be passed in the constructor. 
+ */ template<typename Stencil> class UniformGPUScheme { diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h index 84d9e0f22dd5661d1d428525d3758a5bb9a29488..0221290f425dec2c0fc07214022cf8d80f079b25 100644 --- a/src/gpu/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -47,6 +47,8 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + if(sendFromGPU_){WALBERLA_LOG_DETAIL_ON_ROOT("Using GPU-Direct Communication in UniformGPUScheme")} + else{WALBERLA_LOG_DETAIL_ON_ROOT("Using Communication via CPU Memory")} for (uint_t i = 0; i < Stencil::Q; ++i) WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) @@ -75,6 +77,8 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + if(sendFromGPU_){WALBERLA_LOG_DETAIL_ON_ROOT("Using GPU-Direct Communication in UniformGPUScheme")} + else{WALBERLA_LOG_DETAIL_ON_ROOT("Using Communication via CPU Memory")} for (uint_t i = 0; i < Stencil::Q; ++i) WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h index 96c514fcc4369084273c098ac9bf4ad21310ae29..585d1db348cbf0e1b5572f563b48ff55a717e9ec 100644 --- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h @@ -291,7 +291,7 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif bool areNeighborsInDirection(const Block* block, const BlockID& neighborID, Vector3< cell_idx_t > dirVec) const; - CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > tangentialDir, + CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > dirVec, cell_idx_t width) const; bool skipsThroughCoarseBlock(const Block* block, Direction dir) const; diff --git 
a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h index 987cebe9b2bfd343ed0277ed3faefef4dddaa753..7ff9c7fd3cf5383499f51d9b17bc995f45450ef2 100644 --- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h @@ -425,7 +425,7 @@ inline Vector3< cell_idx_t > } /** - * Returns the part of a cell interval's hull of given width in direction dirVec. + * Returns the part of a cell interval's hull of given \p width in direction \p dirVec. * @param ci The original cell interval * @param dirVec Direction Vector * @param width Width of the hull diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h index 215f5a5c1f5dae1c7b514026bd19def5bff8786f..3abb8a911d68072211d4660a4ee68b24ada22979 100644 --- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h @@ -178,8 +178,7 @@ std::function<void()> BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, Bo template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > -void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation( - Block * block) +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation(Block * block) { auto pdfField = block->getData<PdfField_T>(pdfFieldId_); diff --git a/src/mesa_pd/CMakeLists.txt b/src/mesa_pd/CMakeLists.txt index 19eef8bf9d7e9858eb03113dba4c20c02782dda5..b71b17698f74b9cfeb176332344f95f8eeb301c4 100644 --- a/src/mesa_pd/CMakeLists.txt +++ b/src/mesa_pd/CMakeLists.txt @@ -18,8 +18,8 @@ add_subdirectory( vtk ) if(OPENMESH_CORE_FOUND) set( WALBERLA_MESAPD_CONVEX_POLYHEDRON_AVAILABLE ON CACHE INTERNAL "") target_link_libraries( mesa_pd PUBLIC mesh_common ) - message( STATUS "MESA-PD: 
ConvexPolyhedron shape is available (OpenMesh dependency satisfied)" ) + message( DEBUG "MESA-PD: ConvexPolyhedron shape is available (OpenMesh dependency satisfied)" ) else() set( WALBERLA_MESAPD_CONVEX_POLYHEDRON_AVAILABLE OFF CACHE INTERNAL "") - message( STATUS "MESA-PD: ConvexPolyhedron shape is unavailable (OpenMesh not found)" ) + message( DEBUG "MESA-PD: ConvexPolyhedron shape is unavailable (OpenMesh not found)" ) endif() \ No newline at end of file diff --git a/src/pe/raytracing/Intersects.h b/src/pe/raytracing/Intersects.h index a956e4efe90e78f9c5cd8fbc3a35ba0a92f28060..92adb8f3e51e75cadb09dcfbd6b7ff900d11b56f 100644 --- a/src/pe/raytracing/Intersects.h +++ b/src/pe/raytracing/Intersects.h @@ -355,7 +355,10 @@ inline bool intersectsSphere(const Vec3& gpos, real_t radius, const Ray& ray, re real_t a = direction * direction; real_t b = real_t(2.) * (displacement * direction); real_t c = (displacement * displacement) - (radius * radius); - real_t discriminant = b*b - real_t(4.)*a*c; + // Hotfix for a floating point problem; see https://i10git.cs.fau.de/walberla/walberla/-/issues/243 + real_t discriminant_tmp1 = b*b; + real_t discriminant_tmp2 = -real_t(4.)*a*c; + real_t discriminant = discriminant_tmp1 + discriminant_tmp2; if (discriminant < 0) { // with discriminant smaller than 0, sphere is not hit by ray // (no solution for quadratic equation) diff --git a/src/stencil/D2CornerStencil.h b/src/stencil/D2CornerStencil.h index 9d1716100079943e2229f7418c0c47c0ad2ebc3d..5888bad3047a336dd2aafba3ca2cf93e607b956d 100644 --- a/src/stencil/D2CornerStencil.h +++ b/src/stencil/D2CornerStencil.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D2Q4.h b/src/stencil/D2Q4.h index 9c1f9b7b7f6da572bdb1f7d88963ec0ee94d19da..294f3551220fe8ddd3e21b72e276e46254cffa10 100644 --- a/src/stencil/D2Q4.h +++ b/src/stencil/D2Q4.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D2Q5.h b/src/stencil/D2Q5.h index 98cfbb0099d1cfdbee82a3ff74c5b5b6d50de15c..a09f40cdc1dcdb25be2cc79fde4484fa2cb6c1df 100644 --- a/src/stencil/D2Q5.h +++ b/src/stencil/D2Q5.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D2Q9.h b/src/stencil/D2Q9.h index 4c3dc0cd9948536a4d13473b57a04dd63f928365..5d55c7fe89b21b6f571faa1920ea476d16ba70c8 100644 --- a/src/stencil/D2Q9.h +++ b/src/stencil/D2Q9.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3CornerStencil.h b/src/stencil/D3CornerStencil.h index b234a89d09776919e0c008391cd699ef72a2b9ef..5df290d22b164f08858ed455ed0678f0654e4b17 100644 --- a/src/stencil/D3CornerStencil.h +++ b/src/stencil/D3CornerStencil.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3EdgeCornerStencil.h b/src/stencil/D3EdgeCornerStencil.h index e3e6a2832a025c39d9427cd14234ba3910fd5df5..318752ead018b307f59f2fc01f3a6827b6e99284 100644 --- a/src/stencil/D3EdgeCornerStencil.h +++ b/src/stencil/D3EdgeCornerStencil.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3Q15.h b/src/stencil/D3Q15.h index 896c8c49b912752204af227920e53e56ba42a798..32dab37b9eba8182d69d8d5873f5c4ecfd1d46e3 100644 --- a/src/stencil/D3Q15.h +++ b/src/stencil/D3Q15.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3Q19.h b/src/stencil/D3Q19.h index a7a520f69b4cd2d59e6b03f9ceffa186e5c5bb5a..6077893823241858a2bb90bec3743825869e6f05 100644 --- a/src/stencil/D3Q19.h +++ b/src/stencil/D3Q19.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3Q27.h b/src/stencil/D3Q27.h index 6ba403bd579e77ecc6b324538eeaa44e83ad64ad..74b986f4a2ea9fcefdda55a87cf014aef0e9cd4e 100644 --- a/src/stencil/D3Q27.h +++ b/src/stencil/D3Q27.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3Q6.h b/src/stencil/D3Q6.h index 8b9f81d3ad1cea8de9ca2991a1de327514d90699..249b4dc5427d9232a9673928be5c1f8c6ea1efc6 100644 --- a/src/stencil/D3Q6.h +++ b/src/stencil/D3Q6.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/D3Q7.h b/src/stencil/D3Q7.h index 8e14c9b001d47f384c466c5bd7e0c9a60775581e..33ecb5b14eb63661ec7bded54da25779ed9a60f6 100644 --- a/src/stencil/D3Q7.h +++ b/src/stencil/D3Q7.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/EdgeStencil.h b/src/stencil/EdgeStencil.h index 1c4982c995ffa520a81cdf22ff0442dc30ff1bd6..e045795309850e97671edbe79a5b0ae3ebc899b9 100644 --- a/src/stencil/EdgeStencil.h +++ b/src/stencil/EdgeStencil.h @@ -1,10 +1,10 @@ +#pragma once + //==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. //==================================================================================================================== #ifndef DOXY_SKIP_INTERNAL -#pragma once - #include "Directions.h" #include "Iterator.h" diff --git a/src/stencil/Stencil.in.h b/src/stencil/Stencil.in.h index 893ad59ab36fd860e0dc25f1d0a9feb2e97368ac..4443d823585c4b2413dff0d233a040f1b36aa1a7 100644 --- a/src/stencil/Stencil.in.h +++ b/src/stencil/Stencil.in.h @@ -1,5 +1,3 @@ -#pragma once - #include "Directions.h" #include "Iterator.h" @@ -83,7 +81,7 @@ namespace stencil { /*! 
\name Iteration*/ //@{ - typedef stencil::Iterator<$name> iterator; + using iterator = stencil::Iterator<$name>; static iterator begin() { return iterator(0); } static iterator beginNoCenter() { return iterator(noCenterFirstIdx); } @@ -182,7 +180,7 @@ namespace stencil { } // namespace internal - typedef internal::$name<> $name; + using $name = internal::$name<>; } // namespace stencil } // namespace walberla diff --git a/src/stencil/generate.py b/src/stencil/generate.py index 47291f67de8d838651e1ae7df724c7189aa1ec67..263d7077d2fb56022a20811ae277a025fcbe2abd 100755 --- a/src/stencil/generate.py +++ b/src/stencil/generate.py @@ -80,7 +80,9 @@ def coordinateToDirection(coord): return directionStr -header = """//==================================================================================================================== +header = """#pragma once + +//==================================================================================================================== // Caution: This file has been generated automatically. All manual changes are lost when file is regenerated! // Changes should be done in Stencil.in.h,and then all stencils classes can be generated again. 
//==================================================================================================================== diff --git a/src/vtk/VTKOutput.cpp b/src/vtk/VTKOutput.cpp index 345d4fda3c980210f4b035867ea23e001017b915..42d7d9ef434b92b720a95115ed6dbbdd638905e2 100644 --- a/src/vtk/VTKOutput.cpp +++ b/src/vtk/VTKOutput.cpp @@ -63,7 +63,7 @@ VTKOutput::VTKOutput( const BlockStorage & bs, const std::string & identifier, c VTKOutput::VTKOutput( const StructuredBlockStorage & sbs, const std::string & identifier, const uint_t writeFrequency, const std::string & baseFolder, const std::string & executionFolder, const bool continuousNumbering, const bool binary, const bool littleEndian, const bool useMPIIO, - const uint_t ghostLayers, const bool forcePVTU, const uint_t initialExecutionCount ) : + const uint_t ghostLayers, const bool forcePVTU, const uint_t initialExecutionCount, const bool amrFileFormat, const bool oneFilePerProcess ) : unstructuredBlockStorage_( &sbs.getBlockStorage() ), blockStorage_( &sbs ), @@ -74,8 +74,29 @@ VTKOutput::VTKOutput( const StructuredBlockStorage & sbs, const std::string & id useMPIIO_( useMPIIO ), outputDomainDecomposition_( false ), samplingDx_( real_c(-1) ), samplingDy_( real_c(-1) ), samplingDz_( real_c(-1) ), - forcePVTU_( forcePVTU ), configured_( false ), uniformGrid_( false ), ghostLayers_( ghostLayers ), writeNextStep_( false ) + forcePVTU_( forcePVTU ), configured_( false ), uniformGrid_( false ), amrFileFormat_(amrFileFormat), oneFilePerProcess_(oneFilePerProcess), ghostLayers_( ghostLayers ), writeNextStep_( false ) { + if(ghostLayers > 0 && oneFilePerProcess_) + WALBERLA_LOG_WARNING_ON_ROOT("Writing out ghostlayers is not supported with oneFilePerProcess. The ghostlayers are just dropped. Alternatively MPI-IO could be used to achieve a similar task") + + if (amrFileFormat && oneFilePerProcess) + { + WALBERLA_LOG_WARNING_ON_ROOT("Choose either oneFilePerProcess or amrFileFormat. 
amrFileFormat is set to false in this combination") + amrFileFormat_ = false; + } + + if (useMPIIO_ && amrFileFormat_) + { + WALBERLA_LOG_WARNING_ON_ROOT("Choose either MPI-I0 or amrFileFormat. amrFileFormat is set to false in this combination") + amrFileFormat_ = false; + } + + if (useMPIIO_ && oneFilePerProcess_) + { + WALBERLA_LOG_WARNING_ON_ROOT("Choose either MPI-I0 or oneFilePerProcess. oneFilePerProcess is set to false in this combination") + oneFilePerProcess_ = false; + } + init( identifier ); } @@ -146,6 +167,10 @@ void VTKOutput::init( const std::string & identifier ) if( filesystem::exists( pvd ) && executionCounter_ == 0 ) std::remove( pvd.string().c_str() ); + filesystem::path vthbSeries( baseFolder_ + "/" + identifier_ + ".vthb.series" ); + if( filesystem::exists( vthbSeries ) && executionCounter_ == 0 ) + std::remove( vthbSeries.string().c_str() ); + filesystem::path basePath( baseFolder_ ); if( !filesystem::exists( basePath ) ) filesystem::create_directories( basePath ); @@ -984,13 +1009,6 @@ void VTKOutput::writeBlocks( const std::string& path, const Set<SUID>& requiredS { WALBERLA_ASSERT_NOT_NULLPTR( blockStorage_ ); - std::vector< const IBlock* > blocks; - for( auto block = blockStorage_->begin(); block != blockStorage_->end(); ++block ) - { - if( selectable::isSetSelected( uid::globalState() + block->getState(), requiredStates, incompatibleStates ) ) - blocks.push_back( block.get() ); - } - if( !configured_ ) { if( !forcePVTU_ && cellInclusionFunctions_.empty() && cellExclusionFunctions_.empty() && blockStorage_->getNumberOfLevels() == 1 && ghostLayers_ == 0 ) // uniform data -> vti @@ -1000,39 +1018,58 @@ void VTKOutput::writeBlocks( const std::string& path, const Set<SUID>& requiredS configured_ = true; } - for( auto it = blocks.begin(); it != blocks.end(); ++it ) + if(!uniformGrid_ && oneFilePerProcess_) { - WALBERLA_ASSERT_NOT_NULLPTR( *it ); - const IBlock& block = **it; - + const int rank = MPIManager::instance()->rank(); 
std::ostringstream file; - file << path << "/block [" << block.getId() << "]."; - - if( uniformGrid_ ) // uniform data -> vti + file << path << "/dataRank[" << rank << "].vtu"; + std::ofstream ofs(file.str().c_str()); + writeParallelVTU( ofs, requiredStates, incompatibleStates ); + ofs.close(); + } + else + { + std::vector< const IBlock* > blocks; + for( auto block = blockStorage_->begin(); block != blockStorage_->end(); ++block ) { - file << "vti"; - std::ofstream ofs( file.str().c_str() ); - if( samplingDx_ <= real_c(0) || samplingDy_ <= real_c(0) || samplingDz_ <= real_c(0) ) - writeVTI( ofs, block ); - else - writeVTI_sampling( ofs, block ); - ofs.close(); + if( selectable::isSetSelected( uid::globalState() + block->getState(), requiredStates, incompatibleStates ) ) + blocks.push_back( block.get() ); } - else // unstructured data -> vtu + for( auto it = blocks.begin(); it != blocks.end(); ++it ) { - CellVector cells; // cells to be written to file - computeVTUCells( block, cells ); + WALBERLA_ASSERT_NOT_NULLPTR( *it ); + const IBlock& block = **it; + const uint_t level = blockStorage_->getLevel(block); - if( !cells.empty() ) + std::ostringstream file; + file << path << "/block [" << block.getId() << "] level[" << level << "]."; + + if( uniformGrid_ || amrFileFormat_ ) // uniform data -> vti amr data -> vti { - file << "vtu"; + file << "vti"; std::ofstream ofs( file.str().c_str() ); if( samplingDx_ <= real_c(0) || samplingDy_ <= real_c(0) || samplingDz_ <= real_c(0) ) - writeVTU( ofs, block, cells ); + writeVTI( ofs, block ); else - writeVTU_sampling( ofs, block, cells ); + writeVTI_sampling( ofs, block ); ofs.close(); } + else // unstructured data -> vtu + { + CellVector cells; // cells to be written to file + computeVTUCells( block, cells ); + + if( !cells.empty() ) + { + file << "vtu"; + std::ofstream ofs( file.str().c_str() ); + if( samplingDx_ <= real_c(0) || samplingDy_ <= real_c(0) || samplingDz_ <= real_c(0) ) + writeVTU( ofs, block, cells ); + else + 
writeVTU_sampling( ofs, block, cells ); + ofs.close(); + } + } } } } @@ -1093,6 +1130,7 @@ void VTKOutput::writeVTI( std::ostream& ofs, const IBlock& block ) const { const CellInterval& cellBB = blockStorage_->getBlockCellBB( block ); const AABB& domain = blockStorage_->getDomain(); + const uint_t level = blockStorage_->getLevel( block ); ofs << "<?xml version=\"1.0\"?>\n" << "<VTKFile type=\"ImageData\" version=\"0.1\" byte_order=\"" << endianness_ << "\">\n" @@ -1100,7 +1138,7 @@ void VTKOutput::writeVTI( std::ostream& ofs, const IBlock& block ) const << cellBB.yMin() << " " << ( cellBB.yMax() + 1 ) << " " << cellBB.zMin() << " " << ( cellBB.zMax() + 1 ) << "\"" << " Origin=\"" << domain.xMin() << " " << domain.yMin() << " " << domain.zMin() << "\"" - << " Spacing=\"" << blockStorage_->dx() << " " << blockStorage_->dy() << " " << blockStorage_->dz() << "\">\n"; + << " Spacing=\"" << blockStorage_->dx(level) << " " << blockStorage_->dy(level) << " " << blockStorage_->dz(level) << "\">\n"; writeVTIPiece( ofs, block ); @@ -1208,6 +1246,74 @@ void VTKOutput::writeVTU( std::ostream& ofs, const IBlock& block, const CellVect << "</VTKFile>" << std::endl; } +void VTKOutput::writeParallelVTU( std::ostream& ofs, const Set<SUID>& requiredStates, const Set<SUID>& incompatibleStates ) const +{ + ofs << "<?xml version=\"1.0\"?>\n" + << "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"" << endianness_ << "\">\n" + << " <UnstructuredGrid>\n"; + + WALBERLA_ASSERT_NOT_NULLPTR(blockStorage_); + const uint_t finestLevel = blockStorage_->getNumberOfLevels() - 1; + + std::map< Vertex, Index, VertexCompare > vimap; // vertex<->index map + std::vector< VertexCoord > vc; // vertex coordinates + std::vector< Index > ci; // ci[0] to ci[7]: indices for cell number one, ci[8] to ci[15]: ... 
+ uint_t numberOfCells = 0; + + for( auto block = blockStorage_->begin(); block != blockStorage_->end(); ++block ) + { + if( !selectable::isSetSelected( uid::globalState() + block->getState(), requiredStates, incompatibleStates ) ) + continue; + + // CellVector cells; // cells to be written to file + // computeVTUCells( *block, cells ); + + const uint_t level = blockStorage_->getLevel(*block); + const cell_idx_t factorToFinest = 1 << (finestLevel - level); + const CellInterval cells = blockStorage_->getBlockCellBB(*block); // These are global cells + + for (auto cell = cells.begin(); cell != cells.end(); ++cell) + { + numberOfCells++; + const AABB aabb = blockStorage_->getCellAABB(*cell, level); + for (cell_idx_t z = 0; z != 2; ++z) { + for (cell_idx_t y = 0; y != 2; ++y) { + for (cell_idx_t x = 0; x != 2; ++x) + { + const Vertex v((cell->x() + x) * factorToFinest, (cell->y() + y) * factorToFinest, (cell->z() + z) * factorToFinest); + auto mapping = vimap.find(v); + if (mapping != vimap.end()) // vertex already exists + { + ci.push_back(mapping->second); + } + else // new vertex + { + vimap[v] = numeric_cast< Index >(vc.size()); + ci.push_back(numeric_cast< Index >(vc.size())); + vc.emplace_back((x == 0) ? aabb.xMin() : aabb.xMax(), + (y == 0) ? aabb.yMin() : aabb.yMax(), + (z == 0) ? 
aabb.zMin() : aabb.zMax()); + } + } + } + } + } + } + // <--- setting up vertex-index mapping + writeVTUHeaderPiece(ofs, numberOfCells, vc, ci); + + ofs << " <CellData>\n"; + + writeCellData(ofs, requiredStates, incompatibleStates); + + ofs << " </CellData>\n" + << " </Piece>\n"; + + ofs << " </UnstructuredGrid>\n" + << "</VTKFile>" << std::endl; + +} + void VTKOutput::writeVTUPiece( std::ostream& ofs, const IBlock& block, const CellVector& cells ) const @@ -1561,6 +1667,48 @@ void VTKOutput::writeCellData( std::ostream& ofs, const IBlock& block, const Cel } +void VTKOutput::writeCellData( std::ostream& ofs, const Set<SUID>& requiredStates, const Set<SUID>& incompatibleStates ) const +{ + WALBERLA_ASSERT_NOT_NULLPTR( blockStorage_ ); + + for( auto writer = cellDataWriter_.begin(); writer != cellDataWriter_.end(); ++writer ) + { + ofs << " <DataArray type=\"" << (*writer)->typeString() << "\" Name=\"" << (*writer)->identifier() + << "\" NumberOfComponents=\"" << (*writer)->fSize() << "\" format=\"" << format_ << "\">\n"; + + for( auto block = blockStorage_->begin(); block != blockStorage_->end(); ++block ) + { + if (!selectable::isSetSelected(uid::globalState() + block->getState(), requiredStates, incompatibleStates)) + continue; + + CellVector cells; // cells to be written to file + computeVTUCells(*block, cells); + (*writer)->configure( *block, *blockStorage_ ); + + if( binary_ ) + { + Base64Writer base64; + for( auto cell = cells.begin(); cell != cells.end(); ++cell ) + for( uint_t f = 0; f != (*writer)->fSize(); ++f ) + (*writer)->push( base64, cell->x(), cell->y(), cell->z(), cell_idx_c(f) ); + ofs << " "; base64.toStream( ofs ); + } + else + { + for( auto cell = cells.begin(); cell != cells.end(); ++cell ) { + ofs << " "; + for( uint_t f = 0; f != (*writer)->fSize(); ++f ) + { + (*writer)->push( ofs, cell->x(), cell->y(), cell->z(), cell_idx_c(f) ); + ofs << ( ( f == (*writer)->fSize() - 1 ) ? 
"\n" : " " ); + } + } + } + } + ofs << " </DataArray>\n"; + } +} + void VTKOutput::writeCellData( std::ostream& ofs, const IBlock& block, const std::vector< SamplingCell >& cells ) const { @@ -1614,7 +1762,11 @@ void VTKOutput::writeCollectors( const bool barrier ) WALBERLA_ASSERT_EQUAL( MPIManager::instance()->worldRank(), 0 ); - writePVD(); + if(!amrFileFormat_) + writePVD(); + + + for( auto collector = collectorsToWrite_.begin(); collector != collectorsToWrite_.end(); ++collector ) { @@ -1625,6 +1777,11 @@ void VTKOutput::writeCollectors( const bool barrier ) else writePVTI_sampled( *collector ); } + else if (amrFileFormat_) + { + writeVTHBSeries(); + writeVTHB( *collector ); + } else { writePVTU( *collector ); // also applies for outputDomainDecomposition_ == true and pointDataSource_ != NULL @@ -1683,7 +1840,7 @@ void VTKOutput::writePVD() } else { - ending = ".pvtu"; + ending = amrFileFormat_? ".vthb" : ".pvtu"; if( uniformGrid_ ) ending = ".pvti"; } @@ -1705,6 +1862,62 @@ void VTKOutput::writePVD() } +void VTKOutput::writeVTHBSeries() +{ + if ( !useMPIIO_ && collectorsToWrite_.empty() ) + return; + + std::string file( baseFolder_ + "/" + identifier_ + ".vthb.series" ); + std::fstream ofs( file.c_str() ); + + if( !ofs ) // failed because file does not yet exist + { + ofs.open( file.c_str(), std::ios::out ); + + ofs << "{\n" + << " \"file-series-version\" : \"1.0\",\n" + << " \"files\" : [\n"; + } + else if( pvdEnd_ == std::streampos(-2) ) + { + for( std::string line; std::getline(ofs, line); ) + { + if( line.find("]") != std::string::npos ) + { + WALBERLA_ASSERT_GREATER( ofs.tellg(), 0 ); + pvdEnd_ = ofs.tellg(); + pvdEnd_ -= int_c(line.size()) + 1; + break; + } + } + WALBERLA_ASSERT_GREATER( pvdEnd_, 0 ); + ofs.seekp(pvdEnd_); + } + else + { + ofs.seekp(pvdEnd_); + } + WALBERLA_ASSERT_GREATER(ofs.tellp(), 0); + + std::string ending = ".vthb"; + + for( auto collector = allCollectors_.begin(); collector != allCollectors_.end(); ++collector ) + { + 
std::ostringstream collection; + collection << identifier_ << "/" << executionFolder_ << "_" << *collector << ending; + ofs << " { \"name\" : \"" << collection.str() << "\", \"time\":" << *collector << " },\n"; + } + allCollectors_.clear(); + + pvdEnd_ = ofs.tellp(); + WALBERLA_ASSERT_GREATER( pvdEnd_, 0 ); + ofs << " ]\n" + << "}\n"; + + ofs.close(); +} + + void VTKOutput::writePVTI( const uint_t collector ) const { @@ -1753,6 +1966,40 @@ void VTKOutput::writePVTI( const uint_t collector ) const ofs.close(); } + +void VTKOutput::writeVTHB( const uint_t collector ) const +{ + WALBERLA_ASSERT_NOT_NULLPTR( blockStorage_ ); + + std::ostringstream collection; + collection << baseFolder_ << "/" << identifier_ << "/" << executionFolder_ << "_" << collector << ".vthb"; + std::ofstream ofs( collection.str().c_str() ); + + ofs << "<?xml version=\"1.0\"?>\n" + << "<VTKFile type=\"vtkNonOverlappingAMR\" version=\"1.1\" byte_order=\"" << endianness_ << "\"" << " header_type=\"UInt32\" compressor=\"vtkZLibDataCompressor\">\n" + << " <vtkNonOverlappingAMR>" << "\n"; + + std::vector< std::vector< filesystem::path >> files; + uint_t levels = blockStorage_->getNumberOfLevels(); + files.resize(levels); + getFilenamesSortedByLevel( files, collector ); + + for( uint_t level = 0; level < files.size(); level++){ + ofs << " <Block level=\"" << level << "\">\n"; + walberla::uint_t index = 0; + for( auto file = files[level].begin(); file != files[level].end(); ++file ){ + ofs << " <DataSet index=\"" << index << "\" file=\"" << executionFolder_ << "_" << collector << "/" << file->filename().string() << "\"/>\n"; + index++; + } + ofs << " </Block>\n"; + } + + ofs << " </vtkNonOverlappingAMR>\n" + << "</VTKFile>\n"; + + ofs.close(); +} + void VTKOutput::writePVTI_sampled( const uint_t collector ) const { WALBERLA_ASSERT_NOT_NULLPTR( blockStorage_ ); @@ -1996,6 +2243,31 @@ void VTKOutput::getFilenames( std::vector< filesystem::path >& files, const uint } +void 
VTKOutput::getFilenamesSortedByLevel( std::vector< std::vector< filesystem::path >>& blocks, const uint_t collector ) const +{ + std::ostringstream path; + path << baseFolder_ << "/" << identifier_ << "/" << executionFolder_ << "_" << collector; + filesystem::path directory( path.str() ); + + WALBERLA_ASSERT( filesystem::exists( directory ) ); + + for( filesystem::directory_iterator file( directory ); file != filesystem::directory_iterator(); ++file ) + { + std::string fileName = file->path().string(); + WALBERLA_ASSERT( filesystem::is_regular_file( file->path() ) && !filesystem::is_directory( file->path() ) ); + + std::size_t pos1 = fileName.find("level["); + WALBERLA_ASSERT_UNEQUAL(pos1, std::string::npos, "file names of the block data must contain the block level for AMR data files") + std::size_t pos2 = fileName.find("].vti"); + WALBERLA_ASSERT_UNEQUAL(pos2, std::string::npos, "files must be in vti format") + std::size_t len = pos2 - (pos1 + 6); + uint_t level = uint_c(std::stoi(fileName.substr(pos1 + 6, len))); + WALBERLA_ASSERT_LESS(level, blocks.size()) + blocks[level].push_back(file->path()); + } +} + + void VTKOutput::writePPointData( std::ofstream& ofs ) const { diff --git a/src/vtk/VTKOutput.h b/src/vtk/VTKOutput.h index d90586b5ca9d3cb5bf516733e4be98254a015768..74f12f05a58e7aeeb16c53b85bbe161181b2b8e9 100644 --- a/src/vtk/VTKOutput.h +++ b/src/vtk/VTKOutput.h @@ -125,7 +125,7 @@ public: const uint_t writeFrequency, const uint_t ghostLayers, const bool forcePVTU, const std::string & baseFolder, const std::string & executionFolder, const bool continuousNumbering, const bool binary, const bool littleEndian, - const bool useMPIIO, const uint_t initialExecutionCount ); + const bool useMPIIO, const uint_t initialExecutionCount, const bool amrFileFormat, const bool oneFilePerProcess ); /// creates a VTKOutput object that is supposed to output arbitrary point data friend inline shared_ptr<VTKOutput> createVTKOutput_PointData( const shared_ptr< PointDataSource > 
pds, const std::string & identifier, @@ -192,7 +192,8 @@ private: VTKOutput( const StructuredBlockStorage & sbs, const std::string & identifier, const uint_t writeFrequency, const std::string & baseFolder, const std::string & executionFolder, const bool continuousNumbering, const bool binary, const bool littleEndian, const bool useMPIIO, - const uint_t ghostLayers, const bool forcePVTU, const uint_t initialExecutionCount = 0 ); + const uint_t ghostLayers, const bool forcePVTU, const uint_t initialExecutionCount = 0, + const bool amrFileFormat = false, const bool oneFilePerProcess = false ); /// creates a VTKOutput object that is supposed to output arbitrary point data VTKOutput( const shared_ptr< PointDataSource >& pds, const std::string & identifier, const uint_t writeFrequency, @@ -243,6 +244,7 @@ private: void writeVTU( std::ostream& ofs, const IBlock& block, const CellVector& cells ) const; void writeVTU_sampling( std::ostream& ofs, const IBlock& block, const CellVector& cells ) const; + void writeParallelVTU( std::ostream& ofs, const Set<SUID>& requiredStates, const Set<SUID>& incompatibleStates ) const; void writeVTUPiece(std::ostream& ofs, const IBlock& block, const CellVector& cells) const; void writeVTUPiece_sampling(std::ostream& ofs, const IBlock& block, const CellVector& cells) const; @@ -255,12 +257,15 @@ private: std::vector< SamplingCell > getSamplingCells( const IBlock& block, const CellVector& cells ) const; void writeCellData( std::ostream& ofs, const IBlock& block, const CellVector& cells ) const; + void writeCellData( std::ostream& ofs, const Set<SUID>& requiredStates, const Set<SUID>& incompatibleStates ) const; void writeCellData( std::ostream& ofs, const IBlock& block, const std::vector< SamplingCell >& cells ) const; void writePVD(); + void writeVTHBSeries(); void writePVTI( const uint_t collector ) const; void writePVTI_sampled( const uint_t collector ) const; + void writeVTHB( const uint_t collector ) const; void writePVTU( const uint_t 
collector ) const; bool writeCombinedVTI( std::string localPart, const uint_t collector ) const; @@ -268,6 +273,7 @@ private: bool writeCombinedVTU(std::string localPart, const uint_t collector) const; void getFilenames( std::vector< filesystem::path >& blocks, const uint_t collector ) const; + void getFilenamesSortedByLevel( std::vector< std::vector< filesystem::path >>& blocks, const uint_t collector ) const; void writePPointData( std::ofstream& ofs ) const; void writePCellData( std::ofstream& ofs ) const; @@ -311,6 +317,8 @@ private: const bool forcePVTU_; bool configured_; bool uniformGrid_; + bool amrFileFormat_; + bool oneFilePerProcess_; const uint_t ghostLayers_; @@ -585,10 +593,10 @@ inline shared_ptr<VTKOutput> createVTKOutput_BlockData( const StructuredBlockSto const std::string & executionFolder = std::string("simulation_step"), const bool continuousNumbering = false, const bool binary = true, const bool littleEndian = true, const bool useMPIIO = true, - const uint_t initialExecutionCount = 0 ) + const uint_t initialExecutionCount = 0, const bool amrFileFormat = false, const bool oneFilePerProcess = false ) { return shared_ptr<VTKOutput>( new VTKOutput( sbs, identifier, writeFrequency, baseFolder, executionFolder, - continuousNumbering, binary, littleEndian, useMPIIO, ghostLayers, forcePVTU, initialExecutionCount ) ); + continuousNumbering, binary, littleEndian, useMPIIO, ghostLayers, forcePVTU, initialExecutionCount, amrFileFormat, oneFilePerProcess ) ); } inline shared_ptr<VTKOutput> createVTKOutput_BlockData( const shared_ptr< const StructuredBlockStorage > & sbs, @@ -598,13 +606,13 @@ inline shared_ptr<VTKOutput> createVTKOutput_BlockData( const shared_ptr< const const std::string & executionFolder = std::string("simulation_step"), const bool continuousNumbering = false, const bool binary = true, const bool littleEndian = true, const bool useMPIIO = true, - const uint_t initialExecutionCount = 0 ) + const uint_t initialExecutionCount = 0, const bool 
amrFileFormat = false ) { if( !sbs ) WALBERLA_ABORT( "creating VTK output for block data failed (StructuredBlockStorage shared pointer is NULL)" ); return createVTKOutput_BlockData( *sbs, identifier, writeFrequency, ghostLayers, forcePVTU, baseFolder, executionFolder, - continuousNumbering, binary, littleEndian, useMPIIO, initialExecutionCount ); + continuousNumbering, binary, littleEndian, useMPIIO, initialExecutionCount, amrFileFormat ); } diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt index 8d3f0298ac3bf387dfb19d18b46bfaff8caa6912..604de6371b573b350638ea4fb4f003d93960cb7c 100644 --- a/tests/core/CMakeLists.txt +++ b/tests/core/CMakeLists.txt @@ -119,6 +119,11 @@ waLBerla_execute_test( NAME SetReductionTest4 COMMAND $<TARGET_FILE:SetReductio waLBerla_execute_test( NAME SetReductionTest5 COMMAND $<TARGET_FILE:SetReductionTest> PROCESSES 5 ) waLBerla_execute_test( NAME SetReductionTest27 COMMAND $<TARGET_FILE:SetReductionTest> PROCESSES 27 ) +if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) + waLBerla_compile_test( Name MPIFloat16Test FILES mpi/MPIFloat16Test.cpp DEPENDS core ) + target_compile_features( MPIFloat16Test PUBLIC cxx_std_23 ) + waLBerla_execute_test( NAME MPIFloat16Test4 COMMAND $<TARGET_FILE:MPIFloat16Test> PROCESSES 4) +endif () ############## @@ -172,9 +177,6 @@ waLBerla_compile_test( FILES DebugSTLTest.cpp ) waLBerla_execute_test( NAME DebugSTLTest ) set_tests_properties(DebugSTLTest PROPERTIES WILL_FAIL TRUE) -waLBerla_compile_test( FILES FP16Test.cpp ) -waLBerla_execute_test( NAME FP16Test ) - waLBerla_compile_test( FILES FunctionTraitsTest.cpp ) waLBerla_execute_test( NAME FunctionTraitsTest ) @@ -235,4 +237,7 @@ if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) # Which features are actually supported depend on the compiler target_compile_features( Float16SupportTest PUBLIC cxx_std_23 ) waLBerla_execute_test(NAME Float16SupportTest) + + waLBerla_compile_test( FILES FP16Test.cpp ) + waLBerla_execute_test( NAME FP16Test ) 
endif () \ No newline at end of file diff --git a/tests/core/FP16Test.cpp b/tests/core/FP16Test.cpp index 60a2be0eeee0872449f6a648fa1c65abbbda7f42..e08dd55b099c8edc36f97ab4f86a613d6671d1ac 100644 --- a/tests/core/FP16Test.cpp +++ b/tests/core/FP16Test.cpp @@ -70,7 +70,7 @@ void fp16Test( int argc, char ** argv ) const float16 y = -1.8f16; const float64 z = -0.6; WALBERLA_LOG_INFO_ON_ROOT(" + " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << " ? ") - WALBERLA_CHECK_FLOAT_EQUAL((float64) (x + y), z, "float16 addition does not work correctly."); + WALBERLA_CHECK_FLOAT_EQUAL( (x + y), (float16) z, "float16 addition does not work correctly."); #endif } diff --git a/tests/core/Float16SupportTest.cpp b/tests/core/Float16SupportTest.cpp index 04ea9378f54eee4ee78f81177fc609d732da21c5..5116886ff4e154cecb21465acc39e40caea776dc 100644 --- a/tests/core/Float16SupportTest.cpp +++ b/tests/core/Float16SupportTest.cpp @@ -19,14 +19,16 @@ // //====================================================================================================================== -#include <memory> -#include <numeric> - #include "core/DataTypes.h" #include "core/Environment.h" #include "core/logging/Logging.h" +#include <numeric> + namespace walberla::simple_Float16_test { + + +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT using walberla::floatIsEqual; using walberla::real_t; using walberla::uint_c; @@ -90,12 +92,12 @@ void vector_test() fpDst_cast.assign( 10, (dst_t) 1.5 ); fp32.assign( 10, 1.5f ); std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() ); - WALBERLA_LOG_WARNING_ON_ROOT( + WALBERLA_LOG_INFO_ON_ROOT( " std::vector.assign is not able to assign " << typeid( src_t ).name() << " values to container of type " << precisionType << ".\n" << " Therefore, the floating point value for assign must be cast beforehand or std::copy must be used, since it uses a static_cast internally." 
); - fpSrc[5] = 2.3; + fpSrc[5] = real_c(2.3); fpDst_cast[5] = (dst_t) 2.3; fp32[5] = 2.3f; fpDst[5] = (dst_t) 2.3; @@ -118,7 +120,7 @@ void vector_test() WALBERLA_CHECK_FLOAT_EQUAL( (dst_t)sumSrc, sumDst ); } { - fpSrc.assign( 13, 1.3 ); + fpSrc.assign(13, real_c(1.3)); std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() ); const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end()); const auto sumDst = std::reduce(fpDst.begin(), fpDst.end()); @@ -126,8 +128,11 @@ void vector_test() } } // simple_Float16_test::vector_test() +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + int main( int argc, char** argv ) { +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT // This check only works since C++23 and is used in many implementations, so it's important, that it works. WALBERLA_CHECK( std::is_arithmetic< dst_t >::value ); @@ -149,15 +154,17 @@ int main( int argc, char** argv ) WALBERLA_LOG_INFO_ON_ROOT( " Start a where float32 is sufficient but float16 is not." ); WALBERLA_CHECK_FLOAT_UNEQUAL( dst_t(1.0)-dst_t(0.3), 1.0-0.3 ); WALBERLA_CHECK_FLOAT_EQUAL( 1.0f-0.3f, 1.0-0.3 ); +#else + WALBERLA_LOG_WARNING_ON_ROOT( "\nWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is not enabled. 
So this test cannot be run!\n" ) +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT - return 0; + return EXIT_SUCCESS; } // simple_Float16_test::main() } // namespace walberla::simple_Float16_test int main( int argc, char** argv ) { - walberla::simple_Float16_test::main( argc, argv ); + return walberla::simple_Float16_test::main( argc, argv ); - return EXIT_SUCCESS; } // main() diff --git a/tests/core/mpi/MPIFloat16Test.cpp b/tests/core/mpi/MPIFloat16Test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..be0bc5aa4f86eacc22f223c165f1dd943d5dbd56 --- /dev/null +++ b/tests/core/mpi/MPIFloat16Test.cpp @@ -0,0 +1,162 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file MPIFloat16.cpp +//! \ingroup core +//! \author Michael Zikeli <michael.zikeli@fau.de> +//! \brief This test is to check whether the self defined MPI_Datatype and the self defined Operators are working. +//! To verify the type, some parts of the BufferSystemTest are just copied. +//! To verify the operations, a simple AllReduce is executed for each operation. +//! For now only { SUM, MIN, MAX } are implemented, thus only those are tested. 
+// +//====================================================================================================================== + +#include "core/Abort.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Logging.h" +#include "core/mpi/BufferSystem.h" +#include "core/mpi/Environment.h" +#include "core/mpi/Reduce.h" + + +namespace walberla::mpifloat16test +{ + +using mpi::BufferSystem; +using namespace std::literals::chrono_literals; + +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +void symmetricCommunication() +{ + const int MSG_SIZE = 10; + + auto mpiManager = MPIManager::instance(); + + const int numProcesses = mpiManager->numProcesses(); + const int rank = mpiManager->worldRank(); + const int leftNeighbor = (rank-1+numProcesses) % numProcesses; + const int rightNeighbor = (rank+1) % numProcesses; + + WALBERLA_CHECK_GREATER_EQUAL( numProcesses, 3 ); + + + BufferSystem bs ( MPI_COMM_WORLD ); + + // Pack Message to left neighbor containing own rank + for( int i=0; i< MSG_SIZE; ++i ) + bs.sendBuffer( leftNeighbor ) << numeric_cast<float16>(rank) + float16(0.3); + + // Pack Message to right neighbor containing own rank + for( int i=0; i< MSG_SIZE; ++i ) + bs.sendBuffer( rightNeighbor ) << numeric_cast<float16>(rank) - float16(0.3); + + bs.setReceiverInfoFromSendBufferState( true, false ); + bs.sendAll(); + + for( auto it = bs.begin(); it != bs.end(); ++it ) + { + WALBERLA_CHECK ( it.rank() == leftNeighbor || it.rank() == rightNeighbor ); + WALBERLA_CHECK_EQUAL( it.buffer().size(), MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD ); + + auto receivedVal = float16(-1); + it.buffer() >> receivedVal; + + WALBERLA_CHECK_EQUAL( typeid(receivedVal), typeid(float16) ); + + if ( it.rank() == leftNeighbor ) + { + WALBERLA_CHECK_FLOAT_EQUAL( receivedVal, numeric_cast<float16>( it.rank() ) - float16(0.3) ); + WALBERLA_CHECK_FLOAT_UNEQUAL( receivedVal, numeric_cast<float16>( it.rank() ) + float16(0.3), 0.5); + } else { + 
WALBERLA_CHECK_FLOAT_EQUAL( receivedVal, numeric_cast<float16>( it.rank() ) + float16(0.3) ); + WALBERLA_CHECK_FLOAT_UNEQUAL( receivedVal, numeric_cast<float16>( it.rank() ) - float16(0.3), 0.5); + } + } + + WALBERLA_CHECK_EQUAL( bs.getBytesSent(), (MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD) * 2 ); + WALBERLA_CHECK_EQUAL( bs.getBytesReceived(), (MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD) * 2 ); +} + +void reduce( ) +{ + + auto mpiManager = MPIManager::instance(); + + const int numProcesses = mpiManager->numProcesses(); + const int rank = mpiManager->worldRank(); + + const auto startValue = numeric_cast<float16>(rank) + float16(0.3); + + // SUM + auto value = startValue; + + walberla::mpi::allReduceInplace( value, walberla::mpi::SUM ); + + auto sum = float16( 0.0 ); + for( int i = 0; i < numProcesses; ++i ) + sum += numeric_cast<float16>(i) + float16(0.3); + WALBERLA_CHECK_FLOAT_EQUAL( value, sum ); + WALBERLA_CHECK_FLOAT_UNEQUAL( value, ((numProcesses*(numProcesses-1)/2.)+0.3*numProcesses), 1e-10 ); + + // MIN + value = startValue; + + walberla::mpi::allReduceInplace( value, walberla::mpi::MIN ); + WALBERLA_CHECK_FLOAT_EQUAL( value, numeric_cast<float16>(0.3) ); + + // MAX + value = startValue; + + walberla::mpi::allReduceInplace( value, walberla::mpi::MAX ); + WALBERLA_CHECK_FLOAT_EQUAL( value, numeric_cast<float16>(numProcesses-1)+numeric_cast<float16>(0.3) ); + +} +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + +int main( int argc, char** argv ) +{ +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + mpi::Environment mpiEnv( argc, argv ); + debug::enterTestMode(); + walberla::logging::Logging::instance()->setLogLevel( walberla::logging::Logging::INFO ); + + auto mpiManager = MPIManager::instance(); + auto numProcesses = mpiManager->numProcesses(); + + if(numProcesses <= 2) + { + WALBERLA_ABORT("This test has to be executed on at least 3 processes. 
Executed on " << numProcesses); + return EXIT_FAILURE; + } + + WALBERLA_LOG_INFO_ON_ROOT("Testing Symmetric Communication..."); + symmetricCommunication(); + + WALBERLA_LOG_INFO_ON_ROOT("Testing Reduce operations..."); + reduce( ); +#else + WALBERLA_LOG_WARNING_ON_ROOT( "\nWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is not enabled. So this test cannot be run!\n" ) +#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + + return EXIT_SUCCESS; + +} // mpifloat16test::main() + +} // namespace walberla::mpifloat16test + +int main( int argc, char** argv ) +{ + return walberla::mpifloat16test::main( argc, argv ); +} // main() diff --git a/tests/lbm/codegen/GeneratedFreeSlip.py b/tests/lbm/codegen/GeneratedFreeSlip.py index e079cbe750d88a704c9273d714d80656349e1f1f..65d84275cda0c1d25bc5c2d55ac256fab2c0f3e6 100644 --- a/tests/lbm/codegen/GeneratedFreeSlip.py +++ b/tests/lbm/codegen/GeneratedFreeSlip.py @@ -14,7 +14,7 @@ import sympy as sp with CodeGeneration() as ctx: data_type = "float64" if ctx.double_accuracy else "float32" - stencil = LBStencil(Stencil.D3Q19) + stencil = LBStencil(Stencil.D3Q27) pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[{stencil.D}D]", layout='fzyx') diff --git a/tests/lbm_generated/CMakeLists.txt b/tests/lbm_generated/CMakeLists.txt index d7a7ef76bd6b9a605d19cd7e2bbdf4bedddbed7b..8ba33e735229aacb7a91f89f2f3912d8d1b61f85 100644 --- a/tests/lbm_generated/CMakeLists.txt +++ b/tests/lbm_generated/CMakeLists.txt @@ -6,8 +6,11 @@ waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.py" ) +if( WALBERLA_BUILD_WITH_CODEGEN ) + waLBerla_generate_target_from_python(NAME ExampleGenerated FILE Example.py + CODEGEN_CFG example_codegen OUT_FILES LBMStorageSpecification.h LBMStorageSpecification.cpp LBMSweepCollection.h LBMSweepCollection.cpp NoSlip.h NoSlip.cpp @@ -16,6 +19,36 @@ waLBerla_generate_target_from_python(NAME ExampleGenerated Example_InfoHeader.h) waLBerla_compile_test( FILES Example.cpp 
DEPENDS ExampleGenerated blockforest field lbm_generated timeloop ) +waLBerla_generate_target_from_python(NAME InterpolationNoSlipGenerated + FILE InterpolationNoSlip.py + CODEGEN_CFG interpolation_no_slip_codegen + OUT_FILES InterpolationNoSlipStorageSpecification.h InterpolationNoSlipStorageSpecification.cpp + InterpolationNoSlipSweepCollection.h InterpolationNoSlipSweepCollection.cpp + NoSlip.h NoSlip.cpp + NoSlipBouzidi.h NoSlipBouzidi.cpp + NoSlipQuadraticBB.h NoSlipQuadraticBB.cpp + UBB.h UBB.cpp + InterpolationNoSlipBoundaryCollection.h + InterpolationNoSlipHeader.h) + +waLBerla_compile_test( FILES InterpolationNoSlip.cpp DEPENDS InterpolationNoSlipGenerated blockforest core field geometry lbm_generated timeloop ) +# waLBerla_execute_test( NAME InterpolationNoSlip1 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm -Parameters.distanceWall=0.1 ) +# waLBerla_execute_test( NAME InterpolationNoSlip2 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm -Parameters.distanceWall=0.5 ) +waLBerla_execute_test( NAME InterpolationNoSlip3 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm ) +endif() + +waLBerla_generate_target_from_python(NAME FreeSlipRefinementGenerated + FILE FreeSlipRefinement.py + CODEGEN_CFG free_slip_refinement_codegen + OUT_FILES FreeSlipRefinementStorageSpecification.h FreeSlipRefinementStorageSpecification.cpp + FreeSlipRefinementSweepCollection.h FreeSlipRefinementSweepCollection.cpp + FreeSlip.h FreeSlip.cpp + UBB.h UBB.cpp + Outflow.h Outflow.cpp + FreeSlipRefinementBoundaryCollection.h + FreeSlipRefinementInfoHeader.h) +waLBerla_compile_test( FILES FreeSlipRefinement.cpp DEPENDS FreeSlipRefinementGenerated blockforest field lbm_generated timeloop ) + if( WALBERLA_DOUBLE_ACCURACY ) waLBerla_compile_test( FILES LDC.cpp DEPENDS blockforest field lbm_generated timeloop ) endif() diff --git 
a/tests/lbm_generated/Example.cpp b/tests/lbm_generated/Example.cpp index 4dfd69b553d88d268efb0c49c857eb391f6277ea..2e77ddcb2bfa24c924553afdf71fc5b8081a49ab 100644 --- a/tests/lbm_generated/Example.cpp +++ b/tests/lbm_generated/Example.cpp @@ -177,14 +177,14 @@ int main(int argc, char** argv) StorageSpecification_T StorageSpec = StorageSpecification_T(); BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(2)); - BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx, uint_c(2)); BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(3)); SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega); for (auto& block : *blocks) { - sweepCollection.initialise(&block); + sweepCollection.initialise(&block, cell_idx_c(1)); } const FlagUID fluidFlagUID("Fluid"); diff --git a/tests/lbm_generated/FreeSlipRefinement.cpp b/tests/lbm_generated/FreeSlipRefinement.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a6d51a91ef1419a703596ffbd79cabb4f95cc32 --- /dev/null +++ b/tests/lbm_generated/FreeSlipRefinement.cpp @@ -0,0 +1,280 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FreeSlipRefinement.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +//! \brief Channel flow with inlet and outlet on West and East. The rest of the Boundaries consist of FreeSlip. The +//! Channel flow should reach the inlet velocity in the whole domain because the FreeSlip BC will not provide a +//! restriction on it. With the D3Q27 stencil this only works if the BC is also set on the first fluid node +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/communication/NonUniformBufferedScheme.h" + +#include "core/DataTypes.h" +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Initialization.h" +#include "core/math/Vector3.h" +#include "core/timing/RemainingTimeLogger.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/GhostLayerField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "timeloop/SweepTimeloop.h" + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" + +// include the generated header file. 
It includes all generated classes +#include "FreeSlipRefinementInfoHeader.h" + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::FreeSlipRefinementStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; + +using SweepCollection_T = lbm::FreeSlipRefinementSweepCollection; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::FreeSlipRefinementBoundaryCollection< FlagField_T >; + +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +class ChannelRefinement +{ + public: + ChannelRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + AABB refinementAABB = AABB(forest.getDomain().xSize() / 2 - 1, forest.getDomain().yMin(), forest.getDomain().zSize() / 2 - 1, + forest.getDomain().xSize() / 2 + 1, forest.getDomain().yMin() + 1, forest.getDomain().zSize() / 2 + 1 ); + + for (auto b : blocks) + { + if (refinementAABB.intersects(b->getAABB()) && b->getLevel() < refinementDepth_) + { + b->setMarker(true); + } + } + } + + private: + const uint_t refinementDepth_; +}; + +class Channel +{ + public: + Channel(const uint_t depth) : refinementDepth_(depth), freeSlipFlagUID_("FreeSlip"), ubbFlagUID_("UBB"), outflowFlagUID_("Outflow"){}; + + RefinementSelectionFunctor refinementSelector() { return ChannelRefinement(refinementDepth_); } + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< 
Block& >(*bIt); + uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + uint8_t freeSlipFlag = flagField->registerFlag(freeSlipFlagUID_); + uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + uint8_t outflowFlag = flagField->registerFlag(outflowFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.x() < 0) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level))) { flagField->addFlag(localCell, outflowFlag); } + else if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, freeSlipFlag); } + else if (globalCell.y() < cell_idx_c(1 << level)) {flagField->addFlag(localCell, freeSlipFlag);} + } + } + } + + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID freeSlipFlagUID_; + const FlagUID ubbFlagUID_; + const FlagUID outflowFlagUID_; +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, Channel& setup) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = setup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), uint_c(MPIManager::instance()->numProcesses())); +} + 
+int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + logging::configureLogging(walberlaEnv.config()); + + // read parameters + auto domainSetup = walberlaEnv.config()->getOneBlock("DomainSetup"); + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + const real_t omega = parameters.getParameter< real_t >("omega"); + const real_t inletVelocity = parameters.getParameter< real_t >("inletVelocity"); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1); + const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + + auto loggingParameters = walberlaEnv.config()->getOneBlock("Logging"); + bool writeSetupForestAndReturn = loggingParameters.getParameter<bool>("writeSetupForestAndReturn", false); + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + + auto flowSetup = std::make_shared< Channel >(refinementDepth); + + SetupBlockForest setupBfs; + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *flowSetup); + + // Create structured block forest + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + if (writeSetupForestAndReturn) + { + WALBERLA_LOG_INFO_ON_ROOT("Writing SetupBlockForest to VTK file") + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "FreeSlipRefinementDomainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << 
setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << setupBfs.getNumberOfBlocks(level)) + } + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + return EXIT_SUCCESS; + } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << setupBfs.getNumberOfBlocks(level)) + } + + StorageSpecification_T StorageSpec = StorageSpecification_T(); + BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(2)); + BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx, uint_c(2)); + + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(3)); + + SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega); + for (auto& block : *blocks) + { + sweepCollection.initialise(&block, cell_idx_c(1)); + } + + const FlagUID fluidFlagUID("Fluid"); + flowSetup->setupBoundaryFlagField(*blocks, flagFieldId); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, inletVelocity); + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto comm = + std::make_shared< blockforest::communication::NonUniformBufferedScheme< CommunicationStencil_T > >(blocks); + auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldId); + comm->addPackInfo(packInfo); + + lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > timestep( + blocks, pdfFieldId, sweepCollection, boundaryCollection, comm, packInfo); + + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + uint_t vtkWriteFrequency = 
parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "FreeSlipRefinementVTK", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + vtkOutput->addCellDataWriter(velWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + timeloop.addFuncAfterTimeStep(timestep); + + // log remaining time + if (remainingTimeLoggerFrequency > 1.0) + { + timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + } + + WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps") + + timeloop.run(); + + + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + Block& b = dynamic_cast< Block& >(block); + uint_t level = b.getLevel(); + + auto velField = b.getData< VectorField_T >(velFieldId); + for( auto it = velField->beginXYZ(); it != velField->end(); ++it ) + { + Cell localCell = it.cell(); + Cell globalCell(localCell); + blocks->transformBlockLocalToGlobalCell(globalCell, b); + + if (globalCell.y() >= (cell_idx_c(1 << level))) + { + WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(it.getF(0), inletVelocity, real_c(1e-5)); + } + } + } + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/FreeSlipRefinement.prm b/tests/lbm_generated/FreeSlipRefinement.prm new file mode 100644 index 0000000000000000000000000000000000000000..9149c34d4ab19ff71af337e5be8ad80e2d1e714d --- /dev/null +++ b/tests/lbm_generated/FreeSlipRefinement.prm @@ -0,0 +1,26 @@ +Parameters +{ + omega 1.95; + inletVelocity 0.05; + timesteps 1000; + refinementDepth 1; + + 
remainingTimeLoggerFrequency 0; // in seconds + vtkWriteFrequency 0; +} + +DomainSetup +{ + domainSize <32, 16, 16>; + rootBlocks <4, 2, 2>; + + cellsPerBlock < 8, 8, 8 >; + periodic < 0, 0, 1 >; +} + +Logging +{ + logLevel info; // info progress detail tracing + writeSetupForestAndReturn false; +} + diff --git a/tests/lbm_generated/FreeSlipRefinement.py b/tests/lbm_generated/FreeSlipRefinement.py new file mode 100644 index 0000000000000000000000000000000000000000..e695a8aedddea69bd9b8656de04eb87b76ce64fe --- /dev/null +++ b/tests/lbm_generated/FreeSlipRefinement.py @@ -0,0 +1,49 @@ +import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import FreeSlip, UBB, ExtrapolationOutflow +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +with CodeGeneration() as ctx: + target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + + omega = sp.symbols("omega") + + stencil = LBStencil(Stencil.D3Q27) + pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]", + layout='fzyx') + + macroscopic_fields = {'velocity': vel_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + freeslip = lbm_boundary_generator("FreeSlip", flag_uid="FreeSlip", 
boundary_object=FreeSlip(stencil)) +    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', +                                 boundary_object=UBB([sp.Symbol("u_x"), 0.0, 0.0], data_type=data_type)) +    outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow', +                                     boundary_object=ExtrapolationOutflow(stencil[4], method), +                                     field_data_type=data_type) + +    generate_lbm_package(ctx, name="FreeSlipRefinement", +                         collision_rule=collision_rule, +                         lbm_config=lbm_config, lbm_optimisation=lbm_opt, +                         nonuniform=True, boundaries=[freeslip, ubb, outflow], +                         macroscopic_fields=macroscopic_fields, +                         data_type=data_type, pdfs_data_type=data_type) + +    generate_info_header(ctx, 'FreeSlipRefinementInfoHeader') diff --git a/tests/lbm_generated/InterpolationNoSlip.cpp b/tests/lbm_generated/InterpolationNoSlip.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cfbe99fcc87ba52fab85461a65326148af85a5da --- /dev/null +++ b/tests/lbm_generated/InterpolationNoSlip.cpp @@ -0,0 +1,191 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file InterpolationNoSlip.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +//! \brief Couette flow driven by a UBB BC in the North and wall boundary in the South.
Remaining directions are periodic +//! If Interpolation BC are used the distance of the southern wall can be controlled. The velocity in the +//! first fluid cell is checked and compared with the velocity obtained from a default NoSlip BC. +//! Depending on the set distance for the interpolation BCs the velocity is expected to be higher or lower +// +//====================================================================================================================== +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/DataTypes.h" +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Initialization.h" +#include "core/math/Vector3.h" +#include "core/timing/RemainingTimeLogger.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/GhostLayerField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "timeloop/SweepTimeloop.h" + +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" + +// include the generated header file. 
It includes all generated classes +#include "InterpolationNoSlipHeader.h" + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::InterpolationNoSlipStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using PackInfo_T = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >; + +using SweepCollection_T = lbm::InterpolationNoSlipSweepCollection; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::InterpolationNoSlipBoundaryCollection< FlagField_T >; + +using blockforest::communication::UniformBufferedScheme; + +class wallDistance +{ + public: + wallDistance(const real_t q) : q_(q) {} + + real_t operator()(const Cell& fluidCell, const Cell& boundaryCell, const shared_ptr< StructuredBlockForest >& SbF, + IBlock& block) const; + + private: + const real_t q_; +}; // class wallDistance + +real_t wallDistance::operator()(const Cell& /*fluidCell*/, const Cell& /*boundaryCell*/, + const shared_ptr< StructuredBlockForest >& /*SbF*/, IBlock& /*block*/) const +{ + return q_; +} + +int main(int argc, char** argv) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv(argc, argv); + logging::configureLogging(walberlaEnv.config()); + + auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config()); + + auto domainSetup = walberlaEnv.config()->getOneBlock("DomainSetup"); + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + + // read parameters + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", 
real_c(1.4)); +   const real_t distanceWall = parameters.getParameter< real_t >("distanceWall", real_c(0.5)); +   const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1); + +   WALBERLA_LOG_DEVEL_VAR(distanceWall) + +   auto remainingTimeLoggerFrequency = +      parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + +   const StorageSpecification_T StorageSpec = StorageSpecification_T(); +   BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(1)); +   BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + +   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(1)); + +   SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega); +   for (auto& block : *blocks) +   { +      sweepCollection.initialise(&block); +   } + +   const FlagUID fluidFlagUID("Fluid"); +   auto boundariesConfig = walberlaEnv.config()->getBlock("Boundaries"); +   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig); +   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + +   const wallDistance wallDistanceCallback{ distanceWall }; +   std::function< real_t(const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > +      wallDistanceFunctor = wallDistanceCallback; +   // For the BoundaryCollection, functors to calculate the wall distance for the Bouzidi NoSlip and for the QuadraticBB +   // have to be provided.
In this test case we use the same function to calculate the wall distance + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, omega, wallDistanceFunctor, + wallDistanceFunctor); + + auto packInfo = std::make_shared< lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T > >(pdfFieldId); + UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(packInfo); + + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + timeloop.add() << BeforeFunction(communication, "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeloop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "InterpolationNoSlipVTK", vtkWriteFrequency, 0, false, + "vtk_out", "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + vtkOutput->addCellDataWriter(velWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + if (remainingTimeLoggerFrequency > 0) + { + // log remaining time + timeloop.addFuncAfterTimeStep( + timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + } + + timeloop.run(); + + // This is the velocity at the wall, when a NoSlip BC is used. This is similar to using an interpolation BC with a + // wall distance of 0.5. 
This value can be obtained by either setting distanceWall to 0.5 in the Parameter file or + // specifying the NoSlip BC at the southern boundary + const real_t defaultNoSlipVelocity = real_c(0.0002); + + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + + auto velField = block.getData< VectorField_T >(velFieldId); + auto velAtWall = velField->get(cell_idx_c(cellsPerBlock[0] / 2), 0, cell_idx_c(cellsPerBlock[2] / 2), 0); + // WALBERLA_LOG_DEVEL_VAR(velAtWall) + + if (distanceWall > 0.49 && distanceWall < 0.51) { WALBERLA_CHECK_FLOAT_EQUAL(velAtWall, defaultNoSlipVelocity) } + else if (distanceWall < 0.49) { WALBERLA_CHECK_GREATER(defaultNoSlipVelocity, velAtWall) } + else if (distanceWall > 0.51) { WALBERLA_CHECK_LESS(defaultNoSlipVelocity, velAtWall) } + } + + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/InterpolationNoSlip.prm b/tests/lbm_generated/InterpolationNoSlip.prm new file mode 100644 index 0000000000000000000000000000000000000000..4b15df27eb89235af707a5209abb37c31da50c61 --- /dev/null +++ b/tests/lbm_generated/InterpolationNoSlip.prm @@ -0,0 +1,30 @@ +Parameters +{ + omega 1.1; + timesteps 5000; + distanceWall 0.9; + + remainingTimeLoggerFrequency 0; // in seconds + vtkWriteFrequency 0; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 50, 25, 25 >; + periodic < 1, 0, 1 >; +} + +Boundaries +{ + // Border { direction S; walldistance -1; flag NoSlip; } + // Border { direction S; walldistance -1; flag NoSlipBouzidi; } + Border { direction S; walldistance -1; flag NoSlipQuadraticBB; } + Border { direction N; walldistance -1; flag UBB; } +} + + +Logging +{ + logLevel info; // info progress detail tracing +} diff --git a/tests/lbm_generated/InterpolationNoSlip.py b/tests/lbm_generated/InterpolationNoSlip.py new file mode 100644 index 0000000000000000000000000000000000000000..891892f43c3a5f0acbf187cd064ae0f85b559036 --- /dev/null +++ b/tests/lbm_generated/InterpolationNoSlip.py @@ -0,0 +1,54 @@ 
+import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, NoSlipLinearBouzidi, QuadraticBounceBack, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +import warnings + +warnings.filterwarnings("ignore") +with CodeGeneration() as ctx: + target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + + omega = sp.symbols("omega") + + stencil = LBStencil(Stencil.D3Q27) + pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]", + layout='fzyx') + + macroscopic_fields = {'velocity': vel_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + no_slip_bouzidi = lbm_boundary_generator(class_name='NoSlipBouzidi', flag_uid='NoSlipBouzidi', + boundary_object=NoSlipLinearBouzidi(data_type=data_type)) + no_slip_quadraticbb = lbm_boundary_generator(class_name='NoSlipQuadraticBB', flag_uid='NoSlipQuadraticBB', + boundary_object=QuadraticBounceBack(omega, data_type=data_type)) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.01, 0, 0], data_type=data_type)) + + generate_lbm_package(ctx, 
name="InterpolationNoSlip", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, no_slip_bouzidi, no_slip_quadraticbb, ubb], + macroscopic_fields=macroscopic_fields, data_type=data_type, + set_pre_collision_pdfs=False) + + generate_info_header(ctx, 'InterpolationNoSlipHeader')