diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9422543f7b89c190ab78a03daa43dc227d626a8f..c4dc0ffe6292e737ea319162a0efd7106a1a32fb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -51,6 +51,7 @@ stages:
         -DWALBERLA_BUILD_WITH_METIS=$WALBERLA_BUILD_WITH_METIS
         -DWALBERLA_BUILD_WITH_PARMETIS=$WALBERLA_BUILD_WITH_PARMETIS
         -DWALBERLA_BUILD_WITH_FFTW=$WALBERLA_BUILD_WITH_FFTW
+        -DWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT=$WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
         -DWALBERLA_ENABLE_GUI=$WALBERLA_ENABLE_GUI
         -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN
         -DWALBERLA_STL_BOUNDS_CHECKS=$WALBERLA_STL_BOUNDS_CHECKS
@@ -269,9 +270,9 @@ icx_2022_hybrid_dbg_sp:
       - cuda11
       - docker
 
-gcc_9_serial:
+gcc_10_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -293,9 +294,9 @@ gcc_9_serial:
       - cuda11
       - docker
 
-gcc_9_mpionly:
+gcc_10_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -315,9 +316,9 @@ gcc_9_mpionly:
       - cuda11
       - docker
 
-gcc_9_hybrid:
+gcc_10_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -336,9 +337,9 @@ gcc_9_hybrid:
       - cuda11
       - docker
 
-gcc_9_serial_dbg:
+gcc_10_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -361,9 +362,9 @@ gcc_9_serial_dbg:
       - cuda11
       - docker
 
-gcc_9_mpionly_dbg:
+gcc_10_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -384,9 +385,9 @@ gcc_9_mpionly_dbg:
       - cuda11
       - docker
 
-gcc_9_hybrid_dbg:
+gcc_10_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -403,9 +404,9 @@ gcc_9_hybrid_dbg:
       - cuda11
       - docker
 
-gcc_9_hybrid_dbg_sp:
+gcc_10_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-9
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -428,9 +429,9 @@ gcc_9_hybrid_dbg_sp:
       - cuda11
       - docker
 
-gcc_10_serial:
+gcc_11_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -452,9 +453,9 @@ gcc_10_serial:
       - cuda11
       - docker
 
-gcc_10_mpionly:
+gcc_11_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -474,9 +475,9 @@ gcc_10_mpionly:
       - cuda11
       - docker
 
-gcc_10_hybrid:
+gcc_11_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -495,9 +496,9 @@ gcc_10_hybrid:
       - cuda11
       - docker
 
-gcc_10_serial_dbg:
+gcc_11_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -520,9 +521,9 @@ gcc_10_serial_dbg:
       - cuda11
       - docker
 
-gcc_10_mpionly_dbg:
+gcc_11_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -543,9 +544,9 @@ gcc_10_mpionly_dbg:
       - cuda11
       - docker
 
-gcc_10_hybrid_dbg:
+gcc_11_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -565,9 +566,9 @@ gcc_10_hybrid_dbg:
       - cuda11
       - docker
 
-gcc_10_hybrid_dbg_sp:
+gcc_11_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -590,9 +591,9 @@ gcc_10_hybrid_dbg_sp:
       - cuda11
       - docker
 
-gcc_11_serial:
+gcc_12_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -614,9 +615,9 @@ gcc_11_serial:
       - cuda11
       - docker
 
-gcc_11_mpionly:
+gcc_12_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -636,9 +637,9 @@ gcc_11_mpionly:
       - cuda11
       - docker
 
-gcc_11_hybrid:
+gcc_12_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -657,9 +658,9 @@ gcc_11_hybrid:
       - cuda11
       - docker
 
-gcc_11_serial_dbg:
+gcc_12_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -682,9 +683,9 @@ gcc_11_serial_dbg:
       - cuda11
       - docker
 
-gcc_11_mpionly_dbg:
+gcc_12_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -705,9 +706,9 @@ gcc_11_mpionly_dbg:
       - cuda11
       - docker
 
-gcc_11_hybrid_dbg:
+gcc_12_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -727,9 +728,9 @@ gcc_11_hybrid_dbg:
       - cuda11
       - docker
 
-gcc_11_hybrid_dbg_sp:
+gcc_12_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -752,14 +753,24 @@ gcc_11_hybrid_dbg_sp:
       - cuda11
       - docker
 
-gcc_12_serial:
+gcc_13_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       WALBERLA_BUILD_WITH_MPI: "OFF"
       WALBERLA_BUILD_WITH_OPENMP: "OFF"
       WALBERLA_BUILD_WITH_PARMETIS: "OFF"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    only:
       variables:
          - $ENABLE_NIGHTLY_BUILDS
@@ -767,12 +778,22 @@ gcc_12_serial:
       - cuda11
       - docker
 
-gcc_12_mpionly:
+gcc_13_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       WALBERLA_BUILD_WITH_OPENMP: "OFF"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    only:
       variables:
          - $ENABLE_NIGHTLY_BUILDS
@@ -780,66 +801,116 @@ gcc_12_mpionly:
       - cuda11
       - docker
 
-gcc_12_hybrid:
+gcc_13_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
    stage: pretest
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    tags:
       - cuda11
       - docker
 
-gcc_12_serial_dbg:
+gcc_13_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       WALBERLA_BUILD_WITH_MPI: "OFF"
       WALBERLA_BUILD_WITH_OPENMP: "OFF"
       WALBERLA_BUILD_WITH_PARMETIS: "OFF"
       CMAKE_BUILD_TYPE: "DebugOptimized"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    tags:
       - cuda11
       - docker
 
-gcc_12_mpionly_dbg:
+gcc_13_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       CMAKE_BUILD_TYPE: "DebugOptimized"
       WALBERLA_BUILD_WITH_OPENMP: "OFF"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    tags:
       - cuda11
       - docker
 
-gcc_12_hybrid_dbg:
+gcc_13_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       CMAKE_BUILD_TYPE: "DebugOptimized"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    tags:
       - cuda11
       - docker
 
-gcc_12_hybrid_dbg_sp:
+gcc_13_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
+   before_script:
+      - pip3 install lbmpy==1.3.3 jinja2 pytest
+      - cd python
+      - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
+      - pip3 list
+      - cd ..
+      - CC=gcc CXX=g++ pip3 install cupy-cuda11x
    variables:
       WALBERLA_BUILD_WITH_CUDA: "ON"
       CMAKE_BUILD_TYPE: "DebugOptimized"
       WALBERLA_DOUBLE_ACCURACY: "OFF"
       WALBERLA_BUILD_WITH_PARMETIS: "OFF"
       WALBERLA_BUILD_WITH_METIS: "OFF"
+      WALBERLA_BUILD_WITH_CODEGEN: "ON"
+      WALBERLA_BUILD_WITH_PYTHON: "ON"
+      WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
    tags:
       - cuda11
       - docker
 
-clang_12_serial:
+clang_13_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -861,9 +932,9 @@ clang_12_serial:
       - cuda11
       - docker
 
-clang_12_mpionly:
+clang_13_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -883,9 +954,9 @@ clang_12_mpionly:
       - cuda11
       - docker
 
-clang_12_hybrid:
+clang_13_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -904,9 +975,9 @@ clang_12_hybrid:
       - cuda11
       - docker
 
-clang_12_serial_dbg:
+clang_13_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -929,9 +1000,9 @@ clang_12_serial_dbg:
       - cuda11
       - docker
 
-clang_12_mpionly_dbg:
+clang_13_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -952,9 +1023,9 @@ clang_12_mpionly_dbg:
       - cuda11
       - docker
 
-clang_12_hybrid_dbg:
+clang_13_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -971,9 +1042,9 @@ clang_12_hybrid_dbg:
       - cuda11
       - docker
 
-clang_12_hybrid_dbg_sp:
+clang_13_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-12
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -996,9 +1067,9 @@ clang_12_hybrid_dbg_sp:
       - cuda11
       - docker
 
-clang_13_serial:
+clang_14_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1020,9 +1091,9 @@ clang_13_serial:
       - cuda11
       - docker
 
-clang_13_mpionly:
+clang_14_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1042,9 +1113,9 @@ clang_13_mpionly:
       - cuda11
       - docker
 
-clang_13_hybrid:
+clang_14_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1063,9 +1134,9 @@ clang_13_hybrid:
       - cuda11
       - docker
 
-clang_13_serial_dbg:
+clang_14_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1088,9 +1159,9 @@ clang_13_serial_dbg:
       - cuda11
       - docker
 
-clang_13_mpionly_dbg:
+clang_14_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1111,9 +1182,9 @@ clang_13_mpionly_dbg:
       - cuda11
       - docker
 
-clang_13_hybrid_dbg:
+clang_14_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1133,9 +1204,9 @@ clang_13_hybrid_dbg:
       - cuda11
       - docker
 
-clang_13_hybrid_dbg_sp:
+clang_14_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-13
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1158,9 +1229,9 @@ clang_13_hybrid_dbg_sp:
       - cuda11
       - docker
 
-clang_14_serial:
+clang_15_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1182,9 +1253,9 @@ clang_14_serial:
       - cuda11
       - docker
 
-clang_14_mpionly:
+clang_15_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1204,9 +1275,9 @@ clang_14_mpionly:
       - cuda11
       - docker
 
-clang_14_hybrid:
+clang_15_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1225,9 +1296,9 @@ clang_14_hybrid:
       - cuda11
       - docker
 
-clang_14_serial_dbg:
+clang_15_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1250,9 +1321,9 @@ clang_14_serial_dbg:
       - cuda11
       - docker
 
-clang_14_mpionly_dbg:
+clang_15_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1273,9 +1344,9 @@ clang_14_mpionly_dbg:
       - cuda11
       - docker
 
-clang_14_hybrid_dbg:
+clang_15_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1295,9 +1366,9 @@ clang_14_hybrid_dbg:
       - cuda11
       - docker
 
-clang_14_hybrid_dbg_sp:
+clang_15_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1320,9 +1391,9 @@ clang_14_hybrid_dbg_sp:
       - cuda11
       - docker
 
-clang_15_serial:
+clang_16_serial:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1344,9 +1415,9 @@ clang_15_serial:
       - cuda11
       - docker
 
-clang_15_mpionly:
+clang_16_mpionly:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1366,9 +1437,9 @@ clang_15_mpionly:
       - cuda11
       - docker
 
-clang_15_hybrid:
+clang_16_hybrid:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1384,9 +1455,9 @@ clang_15_hybrid:
       - cuda11
       - docker
 
-clang_15_serial_dbg:
+clang_16_serial_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1406,9 +1477,9 @@ clang_15_serial_dbg:
       - cuda11
       - docker
 
-clang_15_mpionly_dbg:
+clang_16_mpionly_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1426,9 +1497,9 @@ clang_15_mpionly_dbg:
       - cuda11
       - docker
 
-clang_15_hybrid_dbg:
+clang_16_hybrid_dbg:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
       - cd python
@@ -1445,9 +1516,9 @@ clang_15_hybrid_dbg:
       - cuda11
       - docker
 
-clang_15_hybrid_dbg_sp:
+clang_16_hybrid_dbg_sp:
    extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16
    stage: pretest
    before_script:
       - pip3 install lbmpy==1.3.3 jinja2 pytest
@@ -1468,23 +1539,6 @@ clang_15_hybrid_dbg_sp:
       - cuda11
       - docker
 
-
-
-gcc_8_hybrid_dbg_noboost:
-   extends: .build_template
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8
-   before_script:
-      - rm -rf /opt/boost /usr/include/boost
-   variables:
-      CMAKE_BUILD_TYPE: "DebugOptimized"
-      WALBERLA_BUILD_WITH_CUDA: "OFF"
-      WALBERLA_ENABLE_GUI: "OFF"
-      WALBERLA_BUILD_WITH_PYTHON: "OFF"
-   tags:
-      - docker
-
-
-
 ###############################################################################
 ##                                                                           ##
 ##    STL Debug Build                                                        ##
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5eae2503b9bd414976b0cbfce5478d2af4ea4a54..1d21694af654ba0b996a8bbce0c3096ef6a125ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1276,12 +1276,6 @@ if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
       message( FATAL_ERROR "Compiler: ${CMAKE_CXX_COMPILER} Version: ${CMAKE_CXX_COMPILER_VERSION} does not support half precision" )
    endif ()
 
-   # Local host optimization
-   if ( NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST )
-      message( WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
-            "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!" )
-   endif () # Local host check
-
 endif () # Check if WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is set
 
 ############################################################################################################################
diff --git a/cmake/FindOpenMesh.cmake b/cmake/FindOpenMesh.cmake
index 4dbb7f7dd06fe02b5019b4a52e76287eafbc8651..76ca8682a1e6881a0564fb67234c8230362854eb 100644
--- a/cmake/FindOpenMesh.cmake
+++ b/cmake/FindOpenMesh.cmake
@@ -53,7 +53,7 @@
 #                                                                            
 #===========================================================================
 
-cmake_minimum_required(VERSION 3.3.0)
+cmake_minimum_required(VERSION 3.5.0)
 
 #if already found via finder or simulated finder in openmesh CMakeLists.txt, skip the search
 IF (NOT OPENMESH_FOUND) 
diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py
index 8982ff04c926db34c3e2db5e599a5e44a9068450..daaab32fd40aff5f2150495bf4782ef61a282dae 100644
--- a/python/lbmpy_walberla/additional_data_handler.py
+++ b/python/lbmpy_walberla/additional_data_handler.py
@@ -159,7 +159,7 @@ class NoSlipLinearBouzidiAdditionalDataHandler(AdditionalDataHandler):
 
     @property
     def constructor_argument_name(self):
-        return "wallDistance"
+        return "wallDistanceBouzidi"
 
     @property
     def constructor_arguments(self):
@@ -168,7 +168,7 @@ class NoSlipLinearBouzidiAdditionalDataHandler(AdditionalDataHandler):
 
     @property
     def initialiser_list(self):
-        return "elementInitialiser(wallDistance),"
+        return f"elementInitialiser({self.constructor_argument_name}),"
 
     @property
     def additional_arguments_for_fill_function(self):
@@ -207,7 +207,7 @@ class QuadraticBounceBackAdditionalDataHandler(AdditionalDataHandler):
 
     @property
     def constructor_argument_name(self):
-        return "wallDistance"
+        return "wallDistanceQuadraticBB"
 
     @property
     def constructor_arguments(self):
@@ -216,7 +216,7 @@ class QuadraticBounceBackAdditionalDataHandler(AdditionalDataHandler):
 
     @property
     def initialiser_list(self):
-        return "elementInitialiser(wallDistance),"
+        return f"elementInitialiser({self.constructor_argument_name}),"
 
     @property
     def additional_arguments_for_fill_function(self):
diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h
index 4e7c019a86dcf0ce23fb7c8a9e66e6add8500bde..d6147c12be61a072f112e49de6e68001a6013abc 100644
--- a/src/core/DataTypes.h
+++ b/src/core/DataTypes.h
@@ -167,6 +167,7 @@ using real_t = double;
 using real_t = float;
 #endif
 
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 /// Half precision support. Experimental. Use carefully.
 ///
 /// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
@@ -174,7 +175,6 @@ using real_t = float;
 /// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
 /// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
 ///
-#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 /// FIXME: (not really right) Clang version must be 15 or higher for x86 half precision support.
 /// FIXME: (not really right) GCC version must be 12 or higher for x86 half precision support.
 /// FIXME: (I don't know) Also support seems to require SSE, so ensure that respective instruction sets are enabled.
@@ -202,7 +202,7 @@ using half    = _Float16;
 // Another possible half precision format would be the one from Google Brain (bfloat16) with an 8 bit exponent and a 7 bit mantissa.
 // Compare https://i10git.cs.fau.de/ab04unyc/walberla/-/issues/23
 using float16 = half;
-#endif
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 using float32 = float;
 using float64 = double;
 
@@ -276,7 +276,7 @@ inline bool floatIsEqual( walberla::float16 lhs, walberla::float16 rhs, const wa
    const auto difference = lhs - rhs;
    return ( (difference < 0) ? -difference : difference ) < epsilon;
 }
-#endif
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 
 } // namespace walberla
 
diff --git a/src/core/mpi/CMakeLists.txt b/src/core/mpi/CMakeLists.txt
index 13cc3653bd41f03e1840abd09f71147eee3c57cd..2987e28c266bce6552659cce89f5ef9546927a76 100644
--- a/src/core/mpi/CMakeLists.txt
+++ b/src/core/mpi/CMakeLists.txt
@@ -19,11 +19,14 @@ target_sources( core
       MPIIO.h
       MPIManager.cpp
       MPIManager.h
+      MPIOperation.h
       MPITextFile.cpp
       MPITextFile.h
+      MPIWrapper.cpp
       MPIWrapper.h
       OpenMPBufferSystem.h
       OpenMPBufferSystem.impl.h
+      Operation.h
       RecvBuffer.h
       Reduce.h
       SendBuffer.h
diff --git a/src/core/mpi/Datatype.h b/src/core/mpi/Datatype.h
index f717cb6d94c661aec320a864c972dbba4a49d2ae..ac933bbe8c1e572520d0e67f75f8199c1adc8910 100644
--- a/src/core/mpi/Datatype.h
+++ b/src/core/mpi/Datatype.h
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "MPIWrapper.h"
+#include "core/Abort.h"
 
 
 namespace walberla {
@@ -43,6 +44,21 @@ namespace mpi {
          WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); }
       }
 
+      explicit Datatype(const uint_t byteSize) : mpiDatatype_(MPI_DATATYPE_NULL)
+      {
+         WALBERLA_MPI_SECTION()
+         {
+            if (MPI_Type_contiguous(int_c(byteSize), MPI_BYTE, &mpiDatatype_) != MPI_SUCCESS)
+            {
+               WALBERLA_ABORT("MPI_Type_contiguous " << typeid(mpiDatatype_).name() << " failed.");
+            }
+            if (MPI_Type_commit(&mpiDatatype_) != MPI_SUCCESS)
+            {
+               WALBERLA_ABORT("MPI_Type_commit " << typeid(mpiDatatype_).name() << " failed.");
+            }
+         }
+      }
+
       void init( MPI_Datatype datatype )
       {
          mpiDatatype_ = datatype;
diff --git a/src/core/mpi/MPIManager.cpp b/src/core/mpi/MPIManager.cpp
index a334bc16c4878cea58a1452cd083fba31d9d3c7e..c25ca1082277d89c8be486ae7a9c61350d6bfea0 100644
--- a/src/core/mpi/MPIManager.cpp
+++ b/src/core/mpi/MPIManager.cpp
@@ -119,6 +119,10 @@ void MPIManager::finalizeMPI()
 {
    WALBERLA_MPI_SECTION()
    {
+      /// Free the custom types and operators
+      customMPITypes_.clear();
+      customMPIOperations_.clear();
+
       if (isMPIInitialized_ && !currentlyAborting_)
       {
          isMPIInitialized_ = false;
diff --git a/src/core/mpi/MPIManager.h b/src/core/mpi/MPIManager.h
index 9ba3fb4d04b8f6b0c7f1e2041454677b9156bf75..60ce4d8514e57bb2465a14dfa304cab73b3b8be6 100644
--- a/src/core/mpi/MPIManager.h
+++ b/src/core/mpi/MPIManager.h
@@ -18,23 +18,28 @@
 //! \author Florian Schornbaum <florian.schornbaum@fau.de>
 //! \author Martin Bauer <martin.bauer@fau.de>
 //! \author Christian Godenschwager <christian.godenschwager@fau.de>
+//! \author Michael Zikeli <michael.zikeli@fau.de>
 //
 //======================================================================================================================
 
 #pragma once
 
-#include "MPIWrapper.h"
 #include "core/DataTypes.h"
 #include "core/debug/Debug.h"
 #include "core/math/Uint.h"
+#include "core/mpi/Datatype.h"
+#include "core/mpi/MPIOperation.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/Operation.h"
 #include "core/singleton/Singleton.h"
 
+#include <map>
+#include <typeindex>
 
 namespace walberla {
 namespace mpi {
 
 
-
 /**
  * Encapsulates MPI Rank/Communicator information
  *
@@ -127,6 +132,87 @@ public:
    /// Indicates whether MPI-IO can be used with the current MPI communicator; certain versions of OpenMPI produce
    /// segmentation faults when using MPI-IO with a 3D Cartesian MPI communicator (see waLBerla issue #73)
    bool isCommMPIIOValid() const;
+
+   /// Return the custom MPI_Datatype stored in 'customMPITypes_' and defined by the user and passed to 'commitCustomType'.
+   template< typename CType >
+   MPI_Datatype getCustomType() const
+   {
+      WALBERLA_MPI_SECTION()
+      {
+         return customMPITypes_.at(typeid(CType)).operator MPI_Datatype();
+      }
+      WALBERLA_NON_MPI_SECTION()
+      {
+         WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." );
+      }
+   }
+
+   /// Return the custom MPI_Op stored in 'customMPIOperation_' and defined by the user and passed to 'commitCustomOperation'.
+   template< typename CType >
+   MPI_Op getCustomOperation(mpi::Operation op) const
+   {
+      // FIXME the operation is actually type dependent but implementing this is not straightforward,
+      //    compare comment at declaration of 'customMPIOperations_'.
+      WALBERLA_MPI_SECTION()
+      {
+         return customMPIOperations_.at(op).operator MPI_Op();
+      }
+      WALBERLA_NON_MPI_SECTION()
+      {
+         WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." );
+      }
+   }
+   //@}
+   //*******************************************************************************************************************
+
+   //** Setter Functions  **********************************************************************************************
+   /*! \name Setter Function */
+   //@{
+   ///! \brief Initializes a custom MPI_Datatype and logs it in the customMPITypes_ map.
+   ///! \param argument The argument that is expected by the constructor of mpi::Datatype
+   ///     At the point of creation 26.01.2024 this is either MPI_Datatype or const int.
+   template < typename CType, class ConstructorArgumentType >
+   void commitCustomType( ConstructorArgumentType& argument )
+   {
+      WALBERLA_MPI_SECTION()
+      {
+         if (isMPIInitialized_ && !currentlyAborting_)
+         {
+            static_assert( std::is_same_v<ConstructorArgumentType, const int> || std::is_same_v<ConstructorArgumentType, MPI_Datatype>,
+                           "mpi::Datatype has only an constructor for an int value or a MPI_Datatype." );
+            [[maybe_unused]] auto worked = std::get< 1 >( customMPITypes_.try_emplace(typeid(CType), argument) );
+            WALBERLA_ASSERT(worked, "Emplacement of type " << typeid(CType).name() << " did not work.");
+         } else {
+            WALBERLA_ABORT( "MPI must be initialized before an new MPI_Datatype can be committed." );
+         }
+      }
+      WALBERLA_NON_MPI_SECTION()
+      {
+         WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." );
+      }
+   }
+
+   ///! \brief Initializes a custom MPI_Op and logs it in the customMPIOperation map
+   ///! \param op  A operator, e.g. SUM, MIN.
+   ///! \param fct The definition of the MPI_User_function used for this operator.
+   template< typename CType >
+   void commitCustomOperation( mpi::Operation op, MPI_User_function* fct )
+   {
+      WALBERLA_MPI_SECTION()
+      {
+         if (isMPIInitialized_ && !currentlyAborting_)
+         {
+            [[maybe_unused]] auto worked = std::get< 1 >(customMPIOperations_.try_emplace(op, fct));
+            WALBERLA_ASSERT(worked, "Emplacement of operation " << typeid(op).name() << " of type "
+                                                                << typeid(CType).name() << " did not work.");
+         }
+         else { WALBERLA_ABORT("MPI must be initialized before an new MPI_Op can be committed."); }
+      }
+      WALBERLA_NON_MPI_SECTION()
+      {
+         WALBERLA_ABORT( "This should not be called, if waLBerla is compiled without MPI." );
+      }
+   }
    //@}
    //*******************************************************************************************************************
 
@@ -163,6 +249,33 @@ private:
    // Singleton
    MPIManager() : comm_(MPI_COMM_NULL) { WALBERLA_NON_MPI_SECTION() { rank_ = 0; } }
 
+/// It is possible to commit own datatypes to MPI, that are not part of the standard. One example would be float16.
+/// With these maps, it is possible to track self defined MPI_Datatypes and MPI_Ops, to access them at any time and
+/// place in the program, also, they are automatically freed once MPIManager::finalizeMPI is called.
+/// To initialize types or operations and add them to the map, the commit functions 'commitCustomType' and
+/// 'commitCustomOperation' should be used. This can be done e.g. in the specialization of the MPITrait of
+/// the newly defined type. For an example see MPIWrapper.cpp
+   std::map< std::type_index, walberla::mpi::Datatype > customMPITypes_{};
+   std::map< walberla::mpi::Operation, walberla::mpi::MPIOperation > customMPIOperations_{};
+   // FIXME this must be type specific as well, but doing so is a bit more complicated.
+   //  1. Idea) Combining both maps together e.g. as std::map< typeid(CType),
+   //                                                          std::pair< MPI_DataType,
+   //                                                                     std::map< Operation,
+   //                                                                               MPI_Op > > > customMPITypesWithOps_{};
+   //           There the access is quite nasty to look at, but easily doable, the construction however is quite difficult
+   //           also one needs to make sure that the type is initialized before the operation.
+   //  2. Idea) Leaving it as two maps customMPITypes_ and customMPIOperations,
+   //           but storing a pair of typeid and operation as key for the operation map.
+   //           This way everything would look nice, but someone needs to implement a comparison operator for this pair.
+   //           I personally don't know where to put this comparison operator to, since it should not be part of the manager.
+   //  3. Idea) Since this relies on the use of MPITrait<CType> --> { MPI_Datatype, MPI_Op } someone could define a object
+   //           to store in the MPIManager there, to keep the MPIManager light and easily understandable.
+   //           I'm also not sure if the MPITrait is the right spot for this though.
+   //  For more information about the changes done in the code to allow custom defined types and operations,
+   //  check out MR !647 ( https://i10git.cs.fau.de/walberla/walberla/-/merge_requests/647 )
+
+
+
 }; // class MPIManager
 
 
diff --git a/src/core/mpi/MPIOperation.h b/src/core/mpi/MPIOperation.h
new file mode 100644
index 0000000000000000000000000000000000000000..3da98bfe8ea2bdb8783b02e997cdee486bb46a0d
--- /dev/null
+++ b/src/core/mpi/MPIOperation.h
@@ -0,0 +1,64 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MPIOperation.h
+//! \ingroup core
+//! \author Michael Zikeli <michael.zikeli@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/Abort.h"
+#include "core/mpi/MPIWrapper.h"
+
+namespace walberla::mpi{
+
+//*******************************************************************************************************************
+/*! RAII class for MPI operators that commits and frees them
+*
+*/
+//*******************************************************************************************************************
+class MPIOperation
+{
+ public:
+   MPIOperation() = delete;
+
+   explicit MPIOperation( MPI_User_function* fct ) : mpiOperation_( MPI_OP_NULL )
+   {
+      WALBERLA_MPI_SECTION() {
+         if ( MPI_Op_create( fct, true, &mpiOperation_) != MPI_SUCCESS ) // 'true' = op declared commutative to MPI
+         {
+            WALBERLA_ABORT("MPI_Op_create for " << typeid(mpiOperation_).name() << " failed." );
+         }
+      } // WALBERLA_MPI_SECTION
+   }
+
+   ~MPIOperation()
+   {
+      WALBERLA_MPI_SECTION() { MPI_Op_free( & mpiOperation_ ); } // NOTE(review): copy ops are not deleted; copying this RAII wrapper would double-free the MPI_Op — confirm and consider deleting copy ctor/assignment
+   }
+
+   operator MPI_Op() const
+   {
+      return mpiOperation_;
+   }
+
+ protected:
+   MPI_Op mpiOperation_;
+};
+
+
+} // namespace walberla::mpi
\ No newline at end of file
diff --git a/src/core/mpi/MPIWrapper.cpp b/src/core/mpi/MPIWrapper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d68668676c932de2c564dd37312ca3645104e45
--- /dev/null
+++ b/src/core/mpi/MPIWrapper.cpp
@@ -0,0 +1,142 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MPIWrapper.cpp
+//! \ingroup core
+//! \author Michael Zikeli <michael.zikeli@fau.de>
+//
+//======================================================================================================================
+
+#include "MPIWrapper.h"
+
+#include <set>
+
+#include "MPIManager.h"
+
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+namespace walberla
+{
+
+namespace mpi
+{
+namespace
+{
+/// These functions can then be used by self-defined MPI operations, e.g. by using CustomMPIOperation.
+using float16_t = walberla::float16;
+// The signature of MPI_User_function looks like this
+// typedef void MPI_User_function( void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
+
+void sum(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*)
+{
+   // cast mpi type to target c++ type
+   auto* rhs = (float16_t*) mpiRHSArray;
+   auto* lhs = (float16_t*) mpiLHSArray;
+   for (int i = 0; i < *len; ++i)
+   {
+      lhs[i] += rhs[i]; // element-wise: without the index only element 0 was reduced (added *len times)
+   }
+}
+
+void min(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*)
+{
+   // cast mpi type to target c++ type
+   auto* rhs = (float16_t*) mpiRHSArray;
+   auto* lhs = (float16_t*) mpiLHSArray;
+   for (int i = 0; i < *len; ++i)
+   {
+      lhs[i] = (rhs[i] >= lhs[i]) ? lhs[i] : rhs[i]; // element-wise minimum; previously only element 0 was compared
+   }
+}
+
+void max(void* mpiRHSArray, void* mpiLHSArray, int* len, MPI_Datatype*)
+{
+   // cast mpi type to target c++ type
+   auto* rhs = (float16_t*) mpiRHSArray;
+   auto* lhs = (float16_t*) mpiLHSArray;
+   for (int i = 0; i < *len; ++i)
+   {
+      lhs[i] = (rhs[i] <= lhs[i]) ? lhs[i] : rhs[i]; // element-wise maximum; previously only element 0 was compared
+   }
+}
+
+MPI_User_function* returnMPIUserFctPointer(const Operation op)
+{
+   switch (op)
+   {
+   case SUM:
+      return &sum;
+   case MIN:
+      return &min;
+   case MAX:
+      return &max;
+   default:
+      WALBERLA_ABORT("The chosen operation " << typeid(op).name() << " is not implemented for float16 yet.");
+   }
+}
+
+}
+}
+
+/// Here some MPI_Datatypes and MPI_Ops are initialized that are not part of the MPI standard and therefore have to be
+/// defined manually. This is done in the MPIManager, since they need to be freed before MPI_Finalize is called, and
+/// this way it is easiest to keep track of them.
+///     For more information about this feature compare MR !647 (
+///     https://i10git.cs.fau.de/walberla/walberla/-/merge_requests/647 )
+
+/*!
+ *  \brief Specialization of MPITrait for float16
+ *
+ *  The initialization of the self defined MPI_Datatype and MPI_Op is done in the MPIManager so that it can be freed
+ * before MPI is finalized.
+ */
+MPI_Datatype MPITrait< walberla::float16 >::type()
+{
+
+#ifdef WALBERLA_BUILD_WITH_MPI
+   // Since this type should be created only once, a static variable is used as safeguard.
+   static bool initializedType = false;
+   if (!initializedType)
+   {
+      // Since float16 consists of two Bytes, a continuous datatype with size of two byte is created.
+      mpi::MPIManager::instance()->commitCustomType< walberla::float16, const int >(2);
+      initializedType = true;
+   }
+   return mpi::MPIManager::instance()->getCustomType< walberla::float16 >();
+#else
+   return mpistubs::MPI_FLOAT16;
+#endif
+}
+
+MPI_Op MPITrait< walberla::float16 >::operation(const mpi::Operation& op)
+{
+   WALBERLA_MPI_SECTION()
+   {
+      // mpi::Operation is an enum and not an enum class, thus, it is not sufficient to make a just a bool variable as
+      // safeguard, since all operations are of type mpi::Operation and only the first one would pass the safeguard.
+      // Therefore, a set is created and each operation that is called the first time, will be initialized.
+      static std::set< mpi::Operation > operationInitializationRegister;
+      const bool needsInitialization = std::get< 1 >(operationInitializationRegister.emplace(op));
+      if (needsInitialization)
+      {
+         mpi::MPIManager::instance()->commitCustomOperation< walberla::float16 >(
+            op, mpi::returnMPIUserFctPointer(op));
+      }
+      return MPIManager::instance()->getCustomOperation< walberla::float16 >(op);
+   }
+   WALBERLA_NON_MPI_SECTION() { WALBERLA_ABORT("If MPI is not used, a custom operator should never be called."); }
+}
+
+} // namespace walberla
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h
index eecee3136c702a61e7987d25705076a7c780a635..baf0c62d8ba65bd7bd91a665d5d5261e8323d778 100644
--- a/src/core/mpi/MPIWrapper.h
+++ b/src/core/mpi/MPIWrapper.h
@@ -23,8 +23,7 @@
 #pragma once
 
 #include "core/Abort.h"
-
-
+#include "core/mpi/Operation.h"
 
 /// \cond internal
 
@@ -47,6 +46,7 @@
 
 #endif
 
+
 namespace walberla {
 namespace mpistubs {
     //empty namespace which can be used
@@ -77,8 +77,6 @@ namespace mpistubs {
 #pragma warning ( pop )
 #endif
 
-
-
 #else // WALBERLA_BUILD_WITH_MPI
 
 
@@ -143,6 +141,10 @@ const MPI_Datatype MPI_UNSIGNED_LONG_LONG = 10;
 const MPI_Datatype MPI_FLOAT              = 11;
 const MPI_Datatype MPI_DOUBLE             = 12;
 const MPI_Datatype MPI_LONG_DOUBLE        = 13;
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   const MPI_Datatype MPI_FLOAT16  = 14;
+#endif
+
 
 const int MPI_ORDER_C       = 0;
 const int MPI_ORDER_FORTRAN = 0;
@@ -151,16 +153,17 @@ const MPI_Datatype MPI_ANY_SOURCE = -2;
 const MPI_Datatype MPI_ANY_TAG    = -1;
 const MPI_Datatype MPI_DATATYPE_NULL = 0;
 
-const MPI_Op MPI_MIN  = 100;
-const MPI_Op MPI_MAX  = 101;
-const MPI_Op MPI_SUM  = 102;
-const MPI_Op MPI_PROD = 103;
-const MPI_Op MPI_LAND = 104;
-const MPI_Op MPI_BAND = 105;
-const MPI_Op MPI_LOR  = 106;
-const MPI_Op MPI_BOR  = 107;
-const MPI_Op MPI_LXOR = 108;
-const MPI_Op MPI_BXOR = 109;
+const MPI_Op MPI_OP_NULL = 99;
+const MPI_Op MPI_MIN     = 100;
+const MPI_Op MPI_MAX     = 101;
+const MPI_Op MPI_SUM     = 102;
+const MPI_Op MPI_PROD    = 103;
+const MPI_Op MPI_LAND    = 104;
+const MPI_Op MPI_BAND    = 105;
+const MPI_Op MPI_LOR     = 106;
+const MPI_Op MPI_BOR     = 107;
+const MPI_Op MPI_LXOR    = 108;
+const MPI_Op MPI_BXOR    = 109;
 
 const int MPI_PACKED = 1;
 const int MPI_UNDEFINED = -1;
@@ -265,6 +268,7 @@ inline int MPI_Type_get_extent(MPI_Datatype, MPI_Aint*, MPI_Aint*) { WALBERLA_MP
 inline int MPI_Type_create_struct(int, const int[], const MPI_Aint[], const MPI_Datatype[], MPI_Datatype*) { WALBERLA_MPI_FUNCTION_ERROR }
 
 inline int MPI_Op_create(MPI_User_function*, int, MPI_Op*) { WALBERLA_MPI_FUNCTION_ERROR }
+inline int MPI_Op_free(MPI_Op*) { WALBERLA_MPI_FUNCTION_ERROR }
 
 inline int MPI_Get_processor_name( char*, int* ) { WALBERLA_MPI_FUNCTION_ERROR }
 
@@ -307,58 +311,104 @@ namespace mpi {
 
 
 
+
+
 /*!\class MPITrait
 // \brief Base template for the MPITrait class
 //
 // The MPITrait class template offers a translation between the C++ built-in data types and
-// the corresponding MPI data types required for send and receive operations. For a particular
-// MPITrait instantiation, the corresponding MPI data type can be obtained by calling type()
-// of the MPITrait. The following example demonstrates the application of the MPITrait class:
+// the corresponding MPI data types its respective operation required for send, receive and reduce operations.
+// For a particular MPITrait instantiation, the corresponding MPI data type can be obtained by calling type()
+// as well as calling operation( const Operation& ) to the MPI operation corresponding to the MPI data type.
+// The following example demonstrates the application of the MPITrait class:
 
-   \code
+\code
    // Initialization of the MPI communication
    int* pi;  // Integer buffer for the MPI send operation
-   ...       // Initialization of the send buffer
+...       // Initialization of the send buffer
 
    // Sending 50 integers to process 0
    MPI_Send( pi, 50, MPITrait< int >::type(), 0, 0, MPI_COMM_WORLD );
-   \endcode
-*/
+\endcode
+      */
 template< typename T >
-struct MPITrait;
-
-
-
-/// Macro for the creation of MPITrait specializations for the supported data types.
+struct MPITrait
+{
+   static inline MPI_Datatype type();
+   static inline MPI_Op operation(const mpi::Operation&      );
+};
 
-#define WALBERLA_CREATE_MPITRAIT_SPECIALIZATION(CPP_TYPE,MPI_TYPE) \
+/// Macro for specialization of the MPI supported data types in MPITrait::type().
+#define WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(CPP_TYPE, MPI_TYPE) \
    template<> \
-   struct MPITrait< CPP_TYPE > \
+   inline MPI_Datatype MPITrait< CPP_TYPE >::type() \
    { \
-      static inline MPI_Datatype type() { return (MPI_TYPE); } \
+      return (MPI_TYPE); \
    }
 
-
-
 // MPITRAIT SPECIALIZATIONS
 
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( char               , MPI_CHAR               );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed char        , MPI_CHAR               );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed short int   , MPI_SHORT              );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed int         , MPI_INT                );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed long int    , MPI_LONG               );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( signed long long   , MPI_LONG_LONG          );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned char      , MPI_UNSIGNED_CHAR      );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned short int , MPI_UNSIGNED_SHORT     );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned int       , MPI_UNSIGNED           );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long int  , MPI_UNSIGNED_LONG      );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long long , MPI_UNSIGNED_LONG_LONG );
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(char, MPI_CHAR)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed char, MPI_CHAR)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed short int, MPI_SHORT)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed int, MPI_INT)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed long int, MPI_LONG)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(signed long long, MPI_LONG_LONG)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned char, MPI_UNSIGNED_CHAR)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned short int, MPI_UNSIGNED_SHORT)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned int, MPI_UNSIGNED)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned long int, MPI_UNSIGNED_LONG)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(unsigned long long, MPI_UNSIGNED_LONG_LONG)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(float, MPI_FLOAT)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(double, MPI_DOUBLE)
+WALBERLA_CREATE_MPITRAIT_TYPE_SPECIALIZATION(long double, MPI_LONG_DOUBLE)
 #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
-   WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( walberla::float16  , MPI_WCHAR              );
+template<> struct MPITrait< float16 >{
+   static MPI_Datatype type();
+   static MPI_Op operation(const mpi::Operation&      );
+};
+#endif
+
+
+/*!
+ *  \brief Specialization of the static operation() method of MPITrait.
+ *
+ *  It chooses a MPI_Op depending on the value type of the object the operation is performed on.
+ *
+ *  \param op The operation to be performed (op is an element of the enum mpi::Operation).
+ */
+template< typename T >
+MPI_Op MPITrait< T >::operation(const mpi::Operation& op)
+{
+   switch (op)
+   {
+   case mpi::MIN:
+      return MPI_MIN;
+   case mpi::MAX:
+      return MPI_MAX;
+   case mpi::SUM:
+      return MPI_SUM;
+   case mpi::PRODUCT:
+      return MPI_PROD;
+   case mpi::LOGICAL_AND:
+      return MPI_LAND;
+   case mpi::BITWISE_AND:
+      return MPI_BAND;
+   case mpi::LOGICAL_OR:
+      return MPI_LOR;
+   case mpi::BITWISE_OR:
+      return MPI_BOR;
+   case mpi::LOGICAL_XOR:
+      return MPI_LXOR;
+   case mpi::BITWISE_XOR:
+      return MPI_BXOR;
+   default:
+      WALBERLA_ABORT("Unknown operation!");
+   }
+#ifdef __IBMCPP__
+   return MPI_SUM; // never reached, helps to suppress a warning from the IBM compiler
 #endif
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( float              , MPI_FLOAT              );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( double             , MPI_DOUBLE             );
-WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( long double        , MPI_LONG_DOUBLE        );
+}
 
 } // namespace walberla
 /// \endcond
diff --git a/src/core/mpi/Operation.h b/src/core/mpi/Operation.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3097bb89fe4f3fcb3f4581d8435afe2b2b70ec8
--- /dev/null
+++ b/src/core/mpi/Operation.h
@@ -0,0 +1,27 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Operation.h
+//! \ingroup core
+//! \author Michael Zikeli <michael.zikeli@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+namespace walberla::mpi
+{
+// Note: I don't like at all that this is an enum and not an enum class, but changing this would be a major change in the framework.
+enum Operation { MIN, MAX, SUM, PRODUCT, LOGICAL_AND, BITWISE_AND, LOGICAL_OR, BITWISE_OR, LOGICAL_XOR, BITWISE_XOR };
+} // namespace walberla::mpi
\ No newline at end of file
diff --git a/src/core/mpi/Reduce.h b/src/core/mpi/Reduce.h
index 5e9bb8220112ff4bff9c19a02e66cb3c2d801d46..a0b6edb39cad16fdfa5c527f83f9fe007ba9fa2c 100644
--- a/src/core/mpi/Reduce.h
+++ b/src/core/mpi/Reduce.h
@@ -16,18 +16,19 @@
 //! \file Reduce.h
 //! \ingroup core
 //! \author Christian Godenschwager <christian.godenschwager@fau.de>
+//! \author Michael Zikeli <michael.zikeli@fau.de>
 //
 //======================================================================================================================
 
 #pragma once
 
-#include "BufferDataTypeExtensions.h"
-
 #include "core/Abort.h"
 #include "core/DataTypes.h"
 #include "core/debug/Debug.h"
-#include "core/mpi/MPIManager.h"
 #include "core/mpi/MPIWrapper.h"
+#include "core/mpi/Operation.h"
+
+#include "BufferDataTypeExtensions.h"
 
 #include "core/math/Vector3.h"
 
@@ -36,33 +37,10 @@
 
 
 namespace walberla {
-namespace mpi {
 
 
-
-enum Operation { MIN, MAX, SUM, PRODUCT, LOGICAL_AND, BITWISE_AND, LOGICAL_OR, BITWISE_OR, LOGICAL_XOR, BITWISE_XOR };
-
-inline MPI_Op toMPI_Op( Operation operation )
+namespace mpi
 {
-   switch( operation )
-   {
-   case MIN:         return MPI_MIN;
-   case MAX:         return MPI_MAX;
-   case SUM:         return MPI_SUM;
-   case PRODUCT:     return MPI_PROD;
-   case LOGICAL_AND: return MPI_LAND;
-   case BITWISE_AND: return MPI_BAND;
-   case LOGICAL_OR:  return MPI_LOR;
-   case BITWISE_OR:  return MPI_BOR;
-   case LOGICAL_XOR: return MPI_LXOR;
-   case BITWISE_XOR: return MPI_BXOR;
-   default:          WALBERLA_ABORT( "Unknown operation!" );
-   }
-#ifdef __IBMCPP__
-   return MPI_SUM; // never reached, helps to suppress a warning from the IBM compiler
-#endif
-}
-
 //======================================================================================================================
 /*!
  *  \brief Reduces a value over all processes in-place
@@ -91,11 +69,11 @@ void reduceInplace( T & value, Operation operation, int recvRank = 0, MPI_Comm c
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( &value, nullptr, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( &value, nullptr, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
 }
 
@@ -128,11 +106,11 @@ inline void reduceInplace( bool & value, Operation operation, int recvRank = 0,
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
 
    value = intValue != 0;
@@ -172,11 +150,11 @@ T reduce( const T value, Operation operation, int recvRank = 0, MPI_Comm comm =
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( const_cast<T*>( &value ), nullptr, 1, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( const_cast<T*>( &value ), nullptr, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
 
    return result;
@@ -213,11 +191,11 @@ inline bool reduce( const bool value, Operation operation, int recvRank = 0, MPI
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( &intValue, &intResult, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( &intValue, &intResult, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( &intValue, nullptr, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
 
    return intResult != 0;
@@ -252,11 +230,11 @@ void reduceInplace( std::vector<T> & values, Operation operation, int recvRank =
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( values.empty() ? nullptr : &values[0], nullptr, int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( values.empty() ? nullptr : &values[0], nullptr, int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
 }
 
@@ -292,14 +270,14 @@ inline void reduceInplace( std::vector<bool> & values, Operation operation, int
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), recvRank, comm );
       size_t size = values.size();
       convert( sendBuffer, values );
       values.resize(size);
    }
    else
    {
-      MPI_Reduce( sendBuffer.empty() ? nullptr : &sendBuffer[0], nullptr, int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( sendBuffer.empty() ? nullptr : &sendBuffer[0], nullptr, int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), recvRank, comm );
    }
 }
 
@@ -331,11 +309,11 @@ void reduceInplace( math::Vector3<T> & values, Operation operation, int recvRank
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( values.data(), nullptr, 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( values.data(), nullptr, 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
 }
 
@@ -367,11 +345,11 @@ inline void reduceInplace( math::Vector3<bool> & values, Operation operation, in
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
 
    for(uint_t i = 0; i < 3; ++i)
@@ -411,11 +389,11 @@ math::Vector3<T> reduce( const math::Vector3<T> & values, Operation operation, i
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( const_cast<T*>( values.data() ), result.data(), 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( const_cast<T*>( values.data() ), result.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
    else
    {
-      MPI_Reduce( const_cast<T*>( values.data() ), nullptr, 3, MPITrait<T>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( const_cast<T*>( values.data() ), nullptr, 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), recvRank, comm );
    }
 
    return result;
@@ -452,14 +430,14 @@ inline math::Vector3<bool> reduce( const math::Vector3<bool> & values, Operation
 
    if( myRank == recvRank )
    {
-      MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
 
       for(uint_t i = 0; i < 3; ++i)
          results[i] = intValues[i] != 0;
    }
    else
    {
-      MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), toMPI_Op(operation), recvRank, comm );
+      MPI_Reduce( intValues.data(), nullptr, 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), recvRank, comm );
    }
 
    return results;
@@ -487,7 +465,7 @@ T allReduce( const T & value, Operation operation, MPI_Comm comm = MPI_COMM_WORL
    WALBERLA_NON_MPI_SECTION() { return value; }
 
    T result;
-   MPI_Allreduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( const_cast<T*>( &value ), &result, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm );
    return result;
 }
 
@@ -514,7 +492,7 @@ inline bool allReduce( const bool value, Operation operation, MPI_Comm comm = MP
 
    int intValue = value ? 1 : 0;
 
-   MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm );
 
    return intValue != 0;
 }
@@ -539,7 +517,7 @@ void allReduceInplace( T & value, Operation operation, MPI_Comm comm = MPI_COMM_
 
    WALBERLA_NON_MPI_SECTION() { return; }
 
-   MPI_Allreduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, &value, 1, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm );
 }
 
 
@@ -562,7 +540,7 @@ inline void allReduceInplace( bool & value, Operation operation, MPI_Comm comm =
 
    int intValue = value ? 1 : 0;
 
-   MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, &intValue, 1, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm );
 
    value = intValue != 0;
 }
@@ -587,7 +565,7 @@ void allReduceInplace( std::vector<T> & values, Operation operation, MPI_Comm co
 
    WALBERLA_NON_MPI_SECTION() { return; }
 
-   MPI_Allreduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, values.empty() ? nullptr : &values[0], int_c( values.size() ), MPITrait<T>::type(), MPITrait<T>::operation(operation), comm );
 }
 
 
@@ -612,7 +590,7 @@ inline void allReduceInplace( std::vector<bool> & bools, Operation operation, MP
    std::vector<uint8_t> sendBuffer;
 
    convert( bools, sendBuffer );
-   MPI_Allreduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, sendBuffer.empty() ? nullptr : &sendBuffer[0], int_c( sendBuffer.size() ), MPITrait<uint8_t>::type(), MPITrait<uint8_t>::operation(operation), comm );
    auto size = bools.size();
    convert(sendBuffer, bools);
    bools.resize(size);
@@ -637,7 +615,7 @@ void allReduceInplace( math::Vector3<T> & values, Operation operation, MPI_Comm
 
    WALBERLA_NON_MPI_SECTION() { return; }
 
-   MPI_Allreduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, values.data(), 3, MPITrait<T>::type(), MPITrait<T>::operation(operation), comm );
 }
 
 
@@ -663,7 +641,7 @@ inline void allReduceInplace( math::Vector3<bool> & bools, Operation operation,
 
    math::Vector3<int> intValues{bools[0] ? 1 : 0, bools[1] ? 1 : 0, bools[2] ? 1 : 0};
 
-   MPI_Allreduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), toMPI_Op(operation), comm );
+   MPI_Allreduce( MPI_IN_PLACE, intValues.data(), 3, MPITrait<int>::type(), MPITrait<int>::operation(operation), comm );
 
    for(uint_t i = 0; i < 3; ++i)
    {
diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h
index 9f7c3f97d1066ff7ffa322cb5d6550c9f5d5013b..efc7016409a6e91d8c83b96aebfbd9fc40c4a8cf 100644
--- a/src/core/timing/Timer.h
+++ b/src/core/timing/Timer.h
@@ -30,6 +30,7 @@
 #include "WcPolicy.h"
 
 #include "core/DataTypes.h"
+#include "core/mpi/MPIManager.h"
 #include "core/mpi/RecvBuffer.h"
 #include "core/mpi/Reduce.h"
 #include "core/mpi/SendBuffer.h"
@@ -527,7 +528,7 @@ shared_ptr<Timer<TP> > getReduced( Timer<TP>& timer, ReduceType rt, int targetRa
    }
 
    //uint_t counter, double min, double max, double total, double sumOfSquares
-   if ( targetRank < 0 || targetRank == MPIManager::instance()->worldRank() )
+   if ( targetRank < 0 || targetRank == mpi::MPIManager::instance()->worldRank() )
       return make_shared<Timer<TP> >( mpi::MPIManager::instance()->numProcesses(), min, max, total, sumOfSquares  );
 
    return nullptr;
diff --git a/src/domain_decomposition/BlockStorage.h b/src/domain_decomposition/BlockStorage.h
index b59f7b30fe803a2c9abbe48c8856c9b4bbf2be20..2f29acdb7fd45c1e287ebf6d4601b641f303506b 100644
--- a/src/domain_decomposition/BlockStorage.h
+++ b/src/domain_decomposition/BlockStorage.h
@@ -723,6 +723,12 @@ inline void BlockStorage::clearBlockData( const BlockDataID & id )
 {
    for( auto block = begin(); block != end(); ++block )
       block->deleteData( id );
+
+   // Also remove the corresponding entry from the block-data-handling vector.
+   auto elementToErase = std::remove_if(blockDataItem_.begin(), blockDataItem_.end(),
+                                 [id](const internal::BlockDataItem& dataItem)
+                                 { return dataItem.getId() == id; });
+   blockDataItem_.erase(elementToErase, blockDataItem_.end());
 }
 
 
diff --git a/src/field/distributors/KernelDistributor.h b/src/field/distributors/KernelDistributor.h
index 712d9a7b7e32ae96b0d259873d102166675b22b3..bc0abb8fa5882e51960d936132cd3613d55f5172 100644
--- a/src/field/distributors/KernelDistributor.h
+++ b/src/field/distributors/KernelDistributor.h
@@ -62,7 +62,7 @@ public:
       WALBERLA_ASSERT(baseField.nrOfGhostLayers() > uint_t(0), "field for kernel distribution needs at least one ghost layer");
    }
 
-   inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; }
+   inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; }
 
    template< typename ForwardIterator_T >
    inline void distribute( const Vector3<real_t> & position, ForwardIterator_T distributeValueBegin )
diff --git a/src/field/distributors/NearestNeighborDistributor.h b/src/field/distributors/NearestNeighborDistributor.h
index 932f443b3e30af9ab6a2bc4f6c6f3f585e485473..c4819cb9119d46e46dbc34bed40228664bb5cffb 100644
--- a/src/field/distributors/NearestNeighborDistributor.h
+++ b/src/field/distributors/NearestNeighborDistributor.h
@@ -59,7 +59,7 @@ public:
    : blockStorage_( blockStorage ), block_( block ), baseField_( baseField ), flagField_( flagField ), evaluationMask_( evaluationMask )
    {}
 
-   inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; }
+   inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; }
 
    template< typename ForwardIterator_T >
    inline void distribute( const Vector3<real_t> & position, ForwardIterator_T distributeValueBegin )
diff --git a/src/field/interpolators/KernelFieldInterpolator.h b/src/field/interpolators/KernelFieldInterpolator.h
index 0e59fabf21dfd4899e62ca29961811a0e99e59d9..0f5987e76e844df9505869fbaab3feadacbc88fa 100644
--- a/src/field/interpolators/KernelFieldInterpolator.h
+++ b/src/field/interpolators/KernelFieldInterpolator.h
@@ -105,7 +105,7 @@ public:
    }
 
 
-   inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; }
+   inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; }
 
    template< typename ForwardIterator_T >
    inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin )
diff --git a/src/field/interpolators/NearestNeighborFieldInterpolator.h b/src/field/interpolators/NearestNeighborFieldInterpolator.h
index b5b5cba7f65a3e06517356933d157108ba81e90c..bb08276f9f12949a10461e96b1a815acd64f2559 100644
--- a/src/field/interpolators/NearestNeighborFieldInterpolator.h
+++ b/src/field/interpolators/NearestNeighborFieldInterpolator.h
@@ -57,7 +57,7 @@ public:
    {}
 
 
-   inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; }
+   inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; }
 
    template< typename ForwardIterator_T >
    inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin )
diff --git a/src/field/interpolators/TrilinearFieldInterpolator.h b/src/field/interpolators/TrilinearFieldInterpolator.h
index e9809d835f1bf67e89da8a0eb2b5a9493624c84e..351ed7afea09db1c75feeb59dd771afb0796949e 100644
--- a/src/field/interpolators/TrilinearFieldInterpolator.h
+++ b/src/field/interpolators/TrilinearFieldInterpolator.h
@@ -62,7 +62,7 @@ public:
    }
 
 
-   inline bool operator==( const OwnType & other ){ return baseField_ == other.baseField_; }
+   inline bool operator==( const OwnType & other ) const { return baseField_ == other.baseField_; }
 
    template< typename ForwardIterator_T >
    inline void get( const Vector3<real_t> & position, ForwardIterator_T interpolationResultBegin )
diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h
index cd50cc58d6e1c6ef708a1cc50e7fbcc897933281..fc1214e081c0822fdc1370ad9e69f44c2e986594 100644
--- a/src/gpu/FieldAccessor.h
+++ b/src/gpu/FieldAccessor.h
@@ -78,7 +78,7 @@ namespace gpu
       __device__ __forceinline__ bool isValidPosition()  { return true; }
 
       __device__ T & get()       { return * (T*)(ptr_);                }
-      __device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
+      __device__ T & get( uint_t f) { return * (T*)(ptr_ + f * fOffset_); }
 
 
       __device__ T & getNeighbor( int cx, int cy, int cz ) const
@@ -88,7 +88,7 @@ namespace gpu
                                cz * zOffset_ );
       }
 
-      __device__ T & getNeighbor( int cx, int cy, int cz, int cf )
+      __device__ T & getNeighbor( int cx, int cy, int cz, uint_t cf )
       {
          return * (T*)( ptr_ + cx * xOffset_ +
                                cy * yOffset_ +
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
index 63d8da5b4510e09ed10ebbfb73e0d761f4ae407d..e074e9737172f56047c18eae864f9d775985b102 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
@@ -195,8 +195,7 @@ std::function<void()>  BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, Bo
 
 
 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
-void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation(
-   Block * block)
+void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation(Block * block)
 {
    auto pdfField = block->getData<PdfField_T>(pdfFieldId_);
 
diff --git a/src/vtk/VTKOutput.cpp b/src/vtk/VTKOutput.cpp
index 61df10854dfd150eeca85b070c8caac5c1d89318..24f2c0e26bfc3284b0dd6932fe27ed062fd1be41 100644
--- a/src/vtk/VTKOutput.cpp
+++ b/src/vtk/VTKOutput.cpp
@@ -1759,17 +1759,15 @@ void VTKOutput::writeCollectors( const bool barrier )
       WALBERLA_MPI_WORLD_BARRIER();
 
    WALBERLA_NON_ROOT_SECTION() { return; }
-
    WALBERLA_ASSERT_EQUAL( MPIManager::instance()->worldRank(), 0 );
 
-
+   if(!amrFileFormat_)
+      writePVD();
 
    for( auto collector = collectorsToWrite_.begin(); collector != collectorsToWrite_.end(); ++collector )
    {
       if( uniformGrid_ )
       {
-         writePVD();
-
          if( samplingDx_ <= real_c(0) || samplingDy_ <= real_c(0) || samplingDz_ <= real_c(0) )
             writePVTI( *collector );
          else
@@ -1782,7 +1780,6 @@ void VTKOutput::writeCollectors( const bool barrier )
       }
       else
       {
-         writePVD();
          writePVTU( *collector ); // also applies for outputDomainDecomposition_ == true and pointDataSource_ != NULL
                                   // and polylineDataSource_ != NULL (uniformGrid_ will be false)
       }
@@ -1987,15 +1984,6 @@ void VTKOutput::writeVTHB( const uint_t collector ) const
       ofs << "  <Block level=\"" << level << "\">\n";
       uint index = 0;
       for( auto file = files[level].begin(); file != files[level].end(); ++file ){
-         std::ifstream ifs( file->string().c_str() );
-
-         std::string piece;
-         for( uint_t i = 0; i != 4; ++i )
-            std::getline( ifs, piece );
-         ifs.close();
-
-         piece.erase( piece.length()-1, 1 );
-
          ofs << "   <DataSet index=\"" << index << "\" file=\"" << executionFolder_ << "_" << collector << "/" << file->filename().string() << "\"/>\n";
          index++;
       }
diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt
index 8d3f0298ac3bf387dfb19d18b46bfaff8caa6912..604de6371b573b350638ea4fb4f003d93960cb7c 100644
--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -119,6 +119,11 @@ waLBerla_execute_test( NAME SetReductionTest4  COMMAND $<TARGET_FILE:SetReductio
 waLBerla_execute_test( NAME SetReductionTest5  COMMAND $<TARGET_FILE:SetReductionTest> PROCESSES 5 )
 waLBerla_execute_test( NAME SetReductionTest27 COMMAND $<TARGET_FILE:SetReductionTest> PROCESSES 27 )
 
+if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
+   waLBerla_compile_test( Name MPIFloat16Test FILES mpi/MPIFloat16Test.cpp DEPENDS core )
+   target_compile_features( MPIFloat16Test PUBLIC cxx_std_23 )
+   waLBerla_execute_test( NAME MPIFloat16Test4 COMMAND $<TARGET_FILE:MPIFloat16Test> PROCESSES 4)
+endif ()
 
 
 ##############
@@ -172,9 +177,6 @@ waLBerla_compile_test( FILES DebugSTLTest.cpp )
 waLBerla_execute_test( NAME DebugSTLTest )
 set_tests_properties(DebugSTLTest PROPERTIES WILL_FAIL TRUE)
 
-waLBerla_compile_test( FILES FP16Test.cpp )
-waLBerla_execute_test( NAME FP16Test )
-
 waLBerla_compile_test( FILES FunctionTraitsTest.cpp )
 waLBerla_execute_test( NAME FunctionTraitsTest )
 
@@ -235,4 +237,7 @@ if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
    # Which features are actually supported depend on the compiler
    target_compile_features( Float16SupportTest PUBLIC cxx_std_23 )
    waLBerla_execute_test(NAME Float16SupportTest)
+
+   waLBerla_compile_test( FILES FP16Test.cpp )
+   waLBerla_execute_test( NAME FP16Test )
 endif ()
\ No newline at end of file
diff --git a/tests/core/FP16Test.cpp b/tests/core/FP16Test.cpp
index 60a2be0eeee0872449f6a648fa1c65abbbda7f42..e08dd55b099c8edc36f97ab4f86a613d6671d1ac 100644
--- a/tests/core/FP16Test.cpp
+++ b/tests/core/FP16Test.cpp
@@ -70,7 +70,7 @@ void fp16Test( int argc, char ** argv )
    const float16 y = -1.8f16;
    const float64 z = -0.6;
    WALBERLA_LOG_INFO_ON_ROOT("   + " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << " ? ")
-   WALBERLA_CHECK_FLOAT_EQUAL((float64) (x + y), z, "float16 addition does not work correctly.");
+   WALBERLA_CHECK_FLOAT_EQUAL( (x + y), (float16) z, "float16 addition does not work correctly.");
 #endif
 }
 
diff --git a/tests/core/Float16SupportTest.cpp b/tests/core/Float16SupportTest.cpp
index 04ea9378f54eee4ee78f81177fc609d732da21c5..5116886ff4e154cecb21465acc39e40caea776dc 100644
--- a/tests/core/Float16SupportTest.cpp
+++ b/tests/core/Float16SupportTest.cpp
@@ -19,14 +19,16 @@
 //
 //======================================================================================================================
 
-#include <memory>
-#include <numeric>
-
 #include "core/DataTypes.h"
 #include "core/Environment.h"
 #include "core/logging/Logging.h"
 
+#include <numeric>
+
 namespace walberla::simple_Float16_test {
+
+
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 using walberla::floatIsEqual;
 using walberla::real_t;
 using walberla::uint_c;
@@ -90,12 +92,12 @@ void vector_test()
    fpDst_cast.assign( 10, (dst_t) 1.5 );
    fp32.assign( 10, 1.5f );
    std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() );
-   WALBERLA_LOG_WARNING_ON_ROOT(
+   WALBERLA_LOG_INFO_ON_ROOT(
        " std::vector.assign is not able to assign "
        << typeid( src_t ).name() << " values to container of type " << precisionType << ".\n"
        << " Therefore, the floating point value for assign must be cast beforehand or std::copy must be used, since it uses a static_cast internally." );
 
-   fpSrc[5]      = 2.3;
+   fpSrc[5]      = real_c(2.3);
    fpDst_cast[5] = (dst_t) 2.3;
    fp32[5]       = 2.3f;
    fpDst[5]      = (dst_t) 2.3;
@@ -118,7 +120,7 @@ void vector_test()
       WALBERLA_CHECK_FLOAT_EQUAL( (dst_t)sumSrc, sumDst );
    }
    {
-      fpSrc.assign( 13, 1.3 );
+      fpSrc.assign(13, real_c(1.3));
       std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() );
       const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end());
       const auto sumDst = std::reduce(fpDst.begin(), fpDst.end());
@@ -126,8 +128,11 @@ void vector_test()
    }
 } // simple_Float16_test::vector_test()
 
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+
 int main( int argc, char** argv )
 {
+#ifdef  WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
    // This check only works since C++23 and is used in many implementations, so it's important, that it works.
    WALBERLA_CHECK( std::is_arithmetic< dst_t >::value );
 
@@ -149,15 +154,17 @@ int main( int argc, char** argv )
    WALBERLA_LOG_INFO_ON_ROOT( " Start a where float32 is sufficient but float16 is not." );
    WALBERLA_CHECK_FLOAT_UNEQUAL( dst_t(1.0)-dst_t(0.3), 1.0-0.3 );
    WALBERLA_CHECK_FLOAT_EQUAL( 1.0f-0.3f, 1.0-0.3 );
+#else
+   WALBERLA_LOG_WARNING_ON_ROOT( "\nWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is not enabled. So this test cannot be run!\n" )
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
 
-   return 0;
+   return EXIT_SUCCESS;
 } // simple_Float16_test::main()
 
 } // namespace walberla::simple_Float16_test
 
 int main( int argc, char** argv )
 {
-   walberla::simple_Float16_test::main( argc, argv );
+   return walberla::simple_Float16_test::main( argc, argv );
 
-   return EXIT_SUCCESS;
 } // main()
diff --git a/tests/core/mpi/MPIFloat16Test.cpp b/tests/core/mpi/MPIFloat16Test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be0bc5aa4f86eacc22f223c165f1dd943d5dbd56
--- /dev/null
+++ b/tests/core/mpi/MPIFloat16Test.cpp
@@ -0,0 +1,162 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MPIFloat16Test.cpp
+//! \ingroup core
+//! \author Michael Zikeli <michael.zikeli@fau.de>
+//! \brief This test checks whether the self-defined MPI_Datatype and the self-defined operators are working.
+//!    To verify the type, some parts of the BufferSystemTest are just copied.
+//!    To verify the operations, a simple AllReduce is executed for each operation.
+//!        For now only { SUM, MIN, MAX } are implemented, thus only those are tested.
+//
+//======================================================================================================================
+
+#include "core/Abort.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/mpi/BufferSystem.h"
+#include "core/mpi/Environment.h"
+#include "core/mpi/Reduce.h"
+
+
+namespace walberla::mpifloat16test
+{
+
+using mpi::BufferSystem;
+using namespace std::literals::chrono_literals;
+
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+void symmetricCommunication()
+{
+   const int MSG_SIZE = 10;
+
+   auto mpiManager = MPIManager::instance();
+
+   const int numProcesses  = mpiManager->numProcesses();
+   const int rank          = mpiManager->worldRank();
+   const int leftNeighbor  = (rank-1+numProcesses)  % numProcesses;
+   const int rightNeighbor = (rank+1) % numProcesses;
+
+   WALBERLA_CHECK_GREATER_EQUAL( numProcesses, 3 );
+
+
+   BufferSystem bs ( MPI_COMM_WORLD );
+
+   // Pack Message to left neighbor containing own rank
+   for( int i=0; i< MSG_SIZE; ++i )
+      bs.sendBuffer( leftNeighbor )  << numeric_cast<float16>(rank) + float16(0.3);
+
+   // Pack Message to right neighbor containing own rank
+   for( int i=0; i< MSG_SIZE; ++i )
+      bs.sendBuffer( rightNeighbor ) << numeric_cast<float16>(rank) - float16(0.3);
+
+   bs.setReceiverInfoFromSendBufferState( true, false );
+   bs.sendAll();
+
+   for( auto it = bs.begin(); it != bs.end(); ++it )
+   {
+      WALBERLA_CHECK ( it.rank() == leftNeighbor || it.rank() == rightNeighbor );
+      WALBERLA_CHECK_EQUAL( it.buffer().size(), MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD );
+
+      auto receivedVal = float16(-1);
+      it.buffer() >> receivedVal;
+
+      WALBERLA_CHECK_EQUAL( typeid(receivedVal), typeid(float16) );
+
+      if ( it.rank() == leftNeighbor )
+      {
+         WALBERLA_CHECK_FLOAT_EQUAL( receivedVal, numeric_cast<float16>( it.rank() ) - float16(0.3) );
+         WALBERLA_CHECK_FLOAT_UNEQUAL( receivedVal, numeric_cast<float16>( it.rank() ) + float16(0.3), 0.5);
+      } else {
+         WALBERLA_CHECK_FLOAT_EQUAL( receivedVal, numeric_cast<float16>( it.rank() ) + float16(0.3) );
+         WALBERLA_CHECK_FLOAT_UNEQUAL( receivedVal, numeric_cast<float16>( it.rank() ) - float16(0.3), 0.5);
+      }
+   }
+
+   WALBERLA_CHECK_EQUAL( bs.getBytesSent(), (MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD) * 2 );
+   WALBERLA_CHECK_EQUAL( bs.getBytesReceived(), (MSG_SIZE * sizeof(float16) + MSG_SIZE * mpi::BUFFER_DEBUG_OVERHEAD) * 2 );
+}
+
+void reduce( )
+{
+
+   auto mpiManager = MPIManager::instance();
+
+   const int numProcesses  = mpiManager->numProcesses();
+   const int rank          = mpiManager->worldRank();
+
+   const auto startValue = numeric_cast<float16>(rank) + float16(0.3);
+
+   // SUM
+   auto value = startValue;
+
+   walberla::mpi::allReduceInplace( value, walberla::mpi::SUM );
+
+   auto sum = float16( 0.0 );
+   for( int i = 0; i < numProcesses; ++i )
+      sum += numeric_cast<float16>(i) + float16(0.3);
+   WALBERLA_CHECK_FLOAT_EQUAL( value, sum );
+   WALBERLA_CHECK_FLOAT_UNEQUAL( value, ((numProcesses*(numProcesses-1)/2.)+0.3*numProcesses), 1e-10 );
+
+   // MIN
+   value = startValue;
+
+   walberla::mpi::allReduceInplace( value, walberla::mpi::MIN );
+   WALBERLA_CHECK_FLOAT_EQUAL( value, numeric_cast<float16>(0.3) );
+
+   // MAX
+   value = startValue;
+
+   walberla::mpi::allReduceInplace( value, walberla::mpi::MAX );
+   WALBERLA_CHECK_FLOAT_EQUAL( value, numeric_cast<float16>(numProcesses-1)+numeric_cast<float16>(0.3) );
+
+}
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+
+int main( int argc, char** argv )
+{
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   mpi::Environment mpiEnv( argc, argv );
+   debug::enterTestMode();
+   walberla::logging::Logging::instance()->setLogLevel( walberla::logging::Logging::INFO );
+
+   auto mpiManager   = MPIManager::instance();
+   auto numProcesses = mpiManager->numProcesses();
+
+   if(numProcesses <= 2)
+   {
+      WALBERLA_ABORT("This test has to be executed on at least 3 processes. Executed on " <<  numProcesses);
+      return EXIT_FAILURE;
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("Testing Symmetric Communication...");
+   symmetricCommunication();
+
+   WALBERLA_LOG_INFO_ON_ROOT("Testing Reduce operations...");
+   reduce( );
+#else
+   WALBERLA_LOG_WARNING_ON_ROOT( "\nWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is not enabled. So this test cannot be run!\n" )
+#endif // WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+
+   return EXIT_SUCCESS;
+
+} // mpifloat16test::main()
+
+} // namespace walberla::mpifloat16test
+
+int main( int argc, char** argv )
+{
+   return walberla::mpifloat16test::main( argc, argv );
+} // main()
diff --git a/tests/lbm_generated/CMakeLists.txt b/tests/lbm_generated/CMakeLists.txt
index d7a7ef76bd6b9a605d19cd7e2bbdf4bedddbed7b..8ba33e735229aacb7a91f89f2f3912d8d1b61f85 100644
--- a/tests/lbm_generated/CMakeLists.txt
+++ b/tests/lbm_generated/CMakeLists.txt
@@ -6,8 +6,11 @@
 waLBerla_link_files_to_builddir( "*.prm" )
 waLBerla_link_files_to_builddir( "*.py" )
 
+if( WALBERLA_BUILD_WITH_CODEGEN )
+
 waLBerla_generate_target_from_python(NAME ExampleGenerated
         FILE Example.py
+        CODEGEN_CFG example_codegen
         OUT_FILES LBMStorageSpecification.h LBMStorageSpecification.cpp
         LBMSweepCollection.h LBMSweepCollection.cpp
         NoSlip.h NoSlip.cpp
@@ -16,6 +19,36 @@ waLBerla_generate_target_from_python(NAME ExampleGenerated
         Example_InfoHeader.h)
 waLBerla_compile_test( FILES Example.cpp DEPENDS ExampleGenerated blockforest field lbm_generated timeloop )
 
+waLBerla_generate_target_from_python(NAME InterpolationNoSlipGenerated
+        FILE InterpolationNoSlip.py
+        CODEGEN_CFG interpolation_no_slip_codegen
+        OUT_FILES InterpolationNoSlipStorageSpecification.h InterpolationNoSlipStorageSpecification.cpp
+        InterpolationNoSlipSweepCollection.h InterpolationNoSlipSweepCollection.cpp
+        NoSlip.h NoSlip.cpp
+        NoSlipBouzidi.h NoSlipBouzidi.cpp
+        NoSlipQuadraticBB.h NoSlipQuadraticBB.cpp
+        UBB.h UBB.cpp
+        InterpolationNoSlipBoundaryCollection.h
+        InterpolationNoSlipHeader.h)
+
+waLBerla_compile_test( FILES InterpolationNoSlip.cpp DEPENDS InterpolationNoSlipGenerated blockforest core field geometry lbm_generated timeloop )
+# waLBerla_execute_test( NAME InterpolationNoSlip1 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm -Parameters.distanceWall=0.1 )
+# waLBerla_execute_test( NAME InterpolationNoSlip2 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm -Parameters.distanceWall=0.5 )
+waLBerla_execute_test( NAME InterpolationNoSlip3 COMMAND $<TARGET_FILE:InterpolationNoSlip> ${CMAKE_CURRENT_SOURCE_DIR}/InterpolationNoSlip.prm )
+endif()
+
+waLBerla_generate_target_from_python(NAME FreeSlipRefinementGenerated
+        FILE FreeSlipRefinement.py
+        CODEGEN_CFG free_slip_refinement_codegen
+        OUT_FILES FreeSlipRefinementStorageSpecification.h FreeSlipRefinementStorageSpecification.cpp
+        FreeSlipRefinementSweepCollection.h FreeSlipRefinementSweepCollection.cpp
+        FreeSlip.h FreeSlip.cpp
+        UBB.h UBB.cpp
+        Outflow.h Outflow.cpp
+        FreeSlipRefinementBoundaryCollection.h
+        FreeSlipRefinementInfoHeader.h)
+waLBerla_compile_test( FILES FreeSlipRefinement.cpp DEPENDS FreeSlipRefinementGenerated blockforest field lbm_generated timeloop )
+
 if( WALBERLA_DOUBLE_ACCURACY )
 waLBerla_compile_test( FILES LDC.cpp DEPENDS blockforest field lbm_generated timeloop )
 endif()
diff --git a/tests/lbm_generated/FreeSlipRefinement.cpp b/tests/lbm_generated/FreeSlipRefinement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42d91a6eecbd9ad8e25a88c9a4dd4c70986d347e
--- /dev/null
+++ b/tests/lbm_generated/FreeSlipRefinement.cpp
@@ -0,0 +1,280 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FreeSlipRefinement.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \brief Channel flow with inlet and outlet on West and East. The rest of the Boundaries consist of FreeSlip. The
+//!        Channel flow should reach the inlet velocity in the whole domain because the FreeSlip BC will not provide a
+//!        restriction on it. With the D3Q27 stencil this only works if the BC is also set on the first fluid node
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/NonUniformBufferedScheme.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/GhostLayerField.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
+
+// include the generated header file. It includes all generated classes
+#include "FreeSlipRefinementInfoHeader.h"
+
+using namespace walberla;
+using namespace std::placeholders;
+
+using StorageSpecification_T = lbm::FreeSlipRefinementStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+using PdfField_T             = lbm_generated::PdfField< StorageSpecification_T >;
+
+using SweepCollection_T = lbm::FreeSlipRefinementSweepCollection;
+
+using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >;
+using ScalarField_T = GhostLayerField< real_t, 1 >;
+
+using flag_t               = walberla::uint8_t;
+using FlagField_T          = FlagField< flag_t >;
+using BoundaryCollection_T = lbm::FreeSlipRefinementBoundaryCollection< FlagField_T >;
+
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+
+class ChannelRefinement
+{
+ public:
+   ChannelRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest)
+   {
+      std::vector< SetupBlock* > blocks;
+      forest.getBlocks(blocks);
+      AABB refinementAABB = AABB(forest.getDomain().xSize() / 2 - 1, forest.getDomain().yMin(), forest.getDomain().zSize() / 2 - 1,
+                                 forest.getDomain().xSize() / 2 + 1, forest.getDomain().yMin() + 1, forest.getDomain().zSize() / 2 + 1 );
+
+      for (auto b : blocks)
+      {
+         if (refinementAABB.intersects(b->getAABB()) && b->getLevel() < refinementDepth_)
+         {
+            b->setMarker(true);
+         }
+      }
+   }
+
+ private:
+   const uint_t refinementDepth_;
+};
+
+class Channel
+{
+ public:
+   Channel(const uint_t depth) : refinementDepth_(depth), freeSlipFlagUID_("FreeSlip"), ubbFlagUID_("UBB"), outflowFlagUID_("Outflow"){};
+
+   RefinementSelectionFunctor refinementSelector() { return ChannelRefinement(refinementDepth_); }
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         Block& b           = dynamic_cast< Block& >(*bIt);
+         uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         uint8_t freeSlipFlag = flagField->registerFlag(freeSlipFlagUID_);
+         uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         uint8_t outflowFlag    = flagField->registerFlag(outflowFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.x() < 0) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level))) { flagField->addFlag(localCell, outflowFlag); }
+            else if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, freeSlipFlag); }
+            else if (globalCell.y() < cell_idx_c(1 << level)) {flagField->addFlag(localCell, freeSlipFlag);}
+         }
+      }
+   }
+
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID freeSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+   const FlagUID outflowFlagUID_;
+};
+
+static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, Channel& setup)
+{
+   Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize");
+   Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks");
+   Vector3< bool > periodic     = domainSetup.getParameter< Vector3< bool > >("periodic");
+
+   auto refSelection = setup.refinementSelector();
+   setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection));
+   AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), uint_c(MPIManager::instance()->numProcesses()));
+}
+
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+   mpi::MPIManager::instance()->useWorldComm();
+
+   logging::configureLogging(walberlaEnv.config());
+
+   // read parameters
+   auto domainSetup = walberlaEnv.config()->getOneBlock("DomainSetup");
+   auto parameters  = walberlaEnv.config()->getOneBlock("Parameters");
+
+   const real_t omega           = parameters.getParameter< real_t >("omega");
+   const real_t inletVelocity   = parameters.getParameter< real_t >("inletVelocity");
+   const uint_t timesteps       = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1);
+   const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1));
+
+   auto loggingParameters = walberlaEnv.config()->getOneBlock("Logging");
+   bool writeSetupForestAndReturn = loggingParameters.getParameter<bool>("writeSetupForestAndReturn", false);
+
+   auto remainingTimeLoggerFrequency =
+      parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds
+
+   auto flowSetup = std::make_shared< Channel >(refinementDepth);
+
+   SetupBlockForest setupBfs;
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+   createSetupBlockForest(setupBfs, domainSetup, *flowSetup);
+
+   // Create structured block forest
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+
+   WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
+   auto bfs    = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
+   blocks->createCellBoundingBoxes();
+
+   if (writeSetupForestAndReturn)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Writing SetupBlockForest to VTK file")
+      WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "FreeSlipRefinementDomainDecomposition", "vtk_out"); }
+
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << setupBfs.getNumberOfBlocks(level))
+      }
+      WALBERLA_LOG_INFO_ON_ROOT("Ending program")
+      return EXIT_SUCCESS;
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+   for (uint_t level = 0; level <= refinementDepth; level++)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << setupBfs.getNumberOfBlocks(level))
+   }
+
+   StorageSpecification_T StorageSpec = StorageSpecification_T();
+   BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(2));
+   BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx);
+
+   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(3));
+
+   SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega);
+   for (auto& block : *blocks)
+   {
+      sweepCollection.initialise(&block);
+   }
+
+   const FlagUID fluidFlagUID("Fluid");
+   flowSetup->setupBoundaryFlagField(*blocks, flagFieldId);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID, 2);
+   BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, inletVelocity);
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+   auto comm =
+      std::make_shared< blockforest::communication::NonUniformBufferedScheme< CommunicationStencil_T > >(blocks);
+   auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldId);
+   comm->addPackInfo(packInfo);
+
+   lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > timestep(
+      blocks, pdfFieldId, sweepCollection, boundaryCollection, comm, packInfo);
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+   uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+   if (vtkWriteFrequency > 0)
+   {
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "FreeSlipRefinementVTK", vtkWriteFrequency, 0, false, "vtk_out",
+                                                      "simulation_step", false, true, true, false, 0);
+
+      auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity");
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+         {
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+      });
+
+      vtkOutput->addCellDataWriter(velWriter);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+   timeloop.addFuncAfterTimeStep(timestep);
+
+   // log remaining time
+   if (remainingTimeLoggerFrequency > 1.0)
+   {
+      timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+                                    "remaining time logger");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps")
+
+   timeloop.run();
+
+
+   for (auto& block : *blocks)
+   {
+      sweepCollection.calculateMacroscopicParameters(&block);
+      Block& b           = dynamic_cast< Block& >(block);
+      uint_t level       = b.getLevel();
+
+      auto velField  = b.getData< VectorField_T >(velFieldId);
+      for( auto it = velField->beginXYZ(); it != velField->end(); ++it )
+      {
+         Cell localCell = it.cell();
+         Cell globalCell(localCell);
+         blocks->transformBlockLocalToGlobalCell(globalCell, b);
+
+         if (globalCell.y() >= (cell_idx_c(1 << level)))
+         {
+            WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(it.getF(0), inletVelocity, real_c(1e-5));
+         }
+      }
+   }
+   return EXIT_SUCCESS;
+}
diff --git a/tests/lbm_generated/FreeSlipRefinement.prm b/tests/lbm_generated/FreeSlipRefinement.prm
new file mode 100644
index 0000000000000000000000000000000000000000..9149c34d4ab19ff71af337e5be8ad80e2d1e714d
--- /dev/null
+++ b/tests/lbm_generated/FreeSlipRefinement.prm
@@ -0,0 +1,26 @@
+Parameters
+{
+	omega           1.95;
+	inletVelocity   0.05;
+	timesteps       1000;
+	refinementDepth 1;
+
+	remainingTimeLoggerFrequency 0; // in seconds
+	vtkWriteFrequency 0;
+}
+
+DomainSetup
+{
+   domainSize    <32, 16, 16>;
+   rootBlocks    <4, 2, 2>;
+
+   cellsPerBlock <  8, 8, 8 >;
+   periodic      <  0,    0, 1 >;
+}
+
+Logging
+{
+    logLevel info;  // info progress detail tracing
+    writeSetupForestAndReturn false;
+}
+
diff --git a/tests/lbm_generated/FreeSlipRefinement.py b/tests/lbm_generated/FreeSlipRefinement.py
new file mode 100644
index 0000000000000000000000000000000000000000..e695a8aedddea69bd9b8656de04eb87b76ce64fe
--- /dev/null
+++ b/tests/lbm_generated/FreeSlipRefinement.py
@@ -0,0 +1,49 @@
+import sympy as sp
+
+from pystencils import Target
+from pystencils import fields
+
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import FreeSlip, UBB, ExtrapolationOutflow
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+with CodeGeneration() as ctx:
+    target = Target.CPU  # Target.GPU if ctx.cuda else Target.CPU
+    data_type = "float64" if ctx.double_accuracy else "float32"
+
+    streaming_pattern = 'pull'
+    timesteps = get_timesteps(streaming_pattern)
+
+    omega = sp.symbols("omega")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]",
+                             layout='fzyx')
+
+    macroscopic_fields = {'velocity': vel_field}
+
+    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
+                           streaming_pattern=streaming_pattern)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    freeslip = lbm_boundary_generator("FreeSlip", flag_uid="FreeSlip", boundary_object=FreeSlip(stencil))
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([sp.Symbol("u_x"), 0.0, 0.0], data_type=data_type))
+    outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+                                     boundary_object=ExtrapolationOutflow(stencil[4], method),
+                                     field_data_type=data_type)
+
+    generate_lbm_package(ctx, name="FreeSlipRefinement",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[freeslip, ubb, outflow],
+                         macroscopic_fields=macroscopic_fields,
+                         data_type=data_type, pdfs_data_type=data_type)
+
+    generate_info_header(ctx, 'FreeSlipRefinementInfoHeader')
diff --git a/tests/lbm_generated/InterpolationNoSlip.cpp b/tests/lbm_generated/InterpolationNoSlip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cfbe99fcc87ba52fab85461a65326148af85a5da
--- /dev/null
+++ b/tests/lbm_generated/InterpolationNoSlip.cpp
@@ -0,0 +1,191 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file InterpolationNoSlip.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \brief Couette flow driven by a UBB BC in the North and wall boundary in the South. Remaining directions are periodic
+//!        If Interpolation BC are used the distance of the southern wall can be controlled. The velocity in the
+//!        first fluid cell is checked and compared with the velocity obtained from a default NoSlip BC.
+//!        Depending on the set distance for the interpolation BCs the velocity is expected to be higher or lower
+//
+//======================================================================================================================
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/GhostLayerField.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+
+// include the generated header file. It includes all generated classes
+#include "InterpolationNoSlipHeader.h"
+
+using namespace walberla;
+using namespace std::placeholders;
+
+using StorageSpecification_T = lbm::InterpolationNoSlipStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+using PdfField_T             = lbm_generated::PdfField< StorageSpecification_T >;
+using PackInfo_T             = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >;
+
+using SweepCollection_T = lbm::InterpolationNoSlipSweepCollection;
+
+using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >;
+using ScalarField_T = GhostLayerField< real_t, 1 >;
+
+using flag_t               = walberla::uint8_t;
+using FlagField_T          = FlagField< flag_t >;
+using BoundaryCollection_T = lbm::InterpolationNoSlipBoundaryCollection< FlagField_T >;
+
+using blockforest::communication::UniformBufferedScheme;
+
+class wallDistance
+{
+ public:
+   wallDistance(const real_t q) : q_(q) {}
+
+   real_t operator()(const Cell& fluidCell, const Cell& boundaryCell, const shared_ptr< StructuredBlockForest >& SbF,
+                     IBlock& block) const;
+
+ private:
+   const real_t q_;
+}; // class wallDistance
+
+real_t wallDistance::operator()(const Cell& /*fluidCell*/, const Cell& /*boundaryCell*/,
+                                const shared_ptr< StructuredBlockForest >& /*SbF*/, IBlock& /*block*/) const
+{
+   return q_;
+}
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+   walberla::Environment walberlaEnv(argc, argv);
+   logging::configureLogging(walberlaEnv.config());
+
+   auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
+
+   auto domainSetup                = walberlaEnv.config()->getOneBlock("DomainSetup");
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+
+   // read parameters
+   auto parameters   = walberlaEnv.config()->getOneBlock("Parameters");
+   const real_t omega        = parameters.getParameter< real_t >("omega", real_c(1.4));
+   const real_t distanceWall = parameters.getParameter< real_t >("distanceWall", real_c(0.5));
+   const uint_t timesteps    = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1);
+
+   WALBERLA_LOG_DEVEL_VAR(distanceWall)
+
+   auto remainingTimeLoggerFrequency =
+      parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds
+
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(1));
+   BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx);
+
+   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(1));
+
+   SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega);
+   for (auto& block : *blocks)
+   {
+      sweepCollection.initialise(&block);
+   }
+
+   const FlagUID fluidFlagUID("Fluid");
+   auto boundariesConfig = walberlaEnv.config()->getBlock("Boundaries");
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+   const wallDistance wallDistanceCallback{ distanceWall };
+   std::function< real_t(const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
+      wallDistanceFunctor = wallDistanceCallback;
+   // For the BoundaryCollection a functor to calculate the wall distance for the Bouzidi NoSlip and for the QuadraticBB
+   // has to be provided. In this test case we use the same function to calculate the wall distance
+   BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, omega, wallDistanceFunctor,
+                                           wallDistanceFunctor);
+
+   auto packInfo = std::make_shared< lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T > >(pdfFieldId);
+   UniformBufferedScheme< Stencil_T > communication(blocks);
+   communication.addPackInfo(packInfo);
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+   timeloop.add() << BeforeFunction(communication, "communication")
+                  << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions");
+   timeloop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
+
+   uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+   if (vtkWriteFrequency > 0)
+   {
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "InterpolationNoSlipVTK", vtkWriteFrequency, 0, false,
+                                                      "vtk_out", "simulation_step", false, true, true, false, 0);
+
+      auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity");
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+         {
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+      });
+
+      vtkOutput->addCellDataWriter(velWriter);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   if (remainingTimeLoggerFrequency > 0)
+   {
+      // log remaining time
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+         "remaining time logger");
+   }
+
+   timeloop.run();
+
+   // This is the velocity at the wall, when a NoSlip BC is used. This is similar to using an interpolation BC with a
+   // wall distance of 0.5. This value can be obtained by either setting distanceWall to 0.5 in the Parameter file or
+   // specifying the NoSlip BC at the southern boundary
+   const real_t defaultNoSlipVelocity = real_c(0.0002);
+
+   for (auto& block : *blocks)
+   {
+      sweepCollection.calculateMacroscopicParameters(&block);
+
+      auto velField  = block.getData< VectorField_T >(velFieldId);
+      auto velAtWall = velField->get(cell_idx_c(cellsPerBlock[0] / 2), 0, cell_idx_c(cellsPerBlock[2] / 2), 0);
+      // WALBERLA_LOG_DEVEL_VAR(velAtWall)
+
+      if (distanceWall > 0.49 && distanceWall < 0.51) { WALBERLA_CHECK_FLOAT_EQUAL(velAtWall, defaultNoSlipVelocity) }
+      else if (distanceWall < 0.49) { WALBERLA_CHECK_GREATER(defaultNoSlipVelocity, velAtWall) }
+      else if (distanceWall > 0.51) { WALBERLA_CHECK_LESS(defaultNoSlipVelocity, velAtWall) }
+   }
+
+   return EXIT_SUCCESS;
+}
diff --git a/tests/lbm_generated/InterpolationNoSlip.prm b/tests/lbm_generated/InterpolationNoSlip.prm
new file mode 100644
index 0000000000000000000000000000000000000000..4b15df27eb89235af707a5209abb37c31da50c61
--- /dev/null
+++ b/tests/lbm_generated/InterpolationNoSlip.prm
@@ -0,0 +1,30 @@
+Parameters
+{
+	omega           1.1;
+	timesteps       5000;
+	distanceWall    0.9;
+
+	remainingTimeLoggerFrequency 0; // in seconds
+	vtkWriteFrequency 0;
+}
+
+DomainSetup
+{
+    blocks        <  1,  1,  1 >;
+    cellsPerBlock < 50, 25, 25 >;
+    periodic      <  1,  0,  1 >;
+}
+
+Boundaries
+{
+    // Border { direction S;    walldistance -1;  flag NoSlip; }
+    // Border { direction S;    walldistance -1;  flag NoSlipBouzidi; }
+    Border { direction S;    walldistance -1;  flag NoSlipQuadraticBB; }
+    Border { direction N;    walldistance -1;  flag UBB; }
+}
+
+
+Logging
+{
+    logLevel info;  // info progress detail tracing
+}
diff --git a/tests/lbm_generated/InterpolationNoSlip.py b/tests/lbm_generated/InterpolationNoSlip.py
new file mode 100644
index 0000000000000000000000000000000000000000..62463033edaab74efff765a7e412a273b44bccbf
--- /dev/null
+++ b/tests/lbm_generated/InterpolationNoSlip.py
@@ -0,0 +1,53 @@
+import sympy as sp
+
+from pystencils import Target
+from pystencils import fields
+
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import NoSlip, NoSlipLinearBouzidi, QuadraticBounceBack, UBB
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+import warnings
+
+warnings.filterwarnings("ignore")
+with CodeGeneration() as ctx:
+    target = Target.CPU  # Target.GPU if ctx.cuda else Target.CPU
+    data_type = "float64" if ctx.double_accuracy else "float32"
+
+    streaming_pattern = 'pull'
+    timesteps = get_timesteps(streaming_pattern)
+
+    omega = sp.symbols("omega")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]",
+                             layout='fzyx')
+
+    macroscopic_fields = {'velocity': vel_field}
+
+    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
+                           streaming_pattern=streaming_pattern)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip())
+    no_slip_bouzidi = lbm_boundary_generator(class_name='NoSlipBouzidi', flag_uid='NoSlipBouzidi',
+                                             boundary_object=NoSlipLinearBouzidi(data_type=data_type))
+    no_slip_quadraticbb = lbm_boundary_generator(class_name='NoSlipQuadraticBB', flag_uid='NoSlipQuadraticBB',
+                                                 boundary_object=QuadraticBounceBack(omega, data_type=data_type))
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([0.01, 0, 0], data_type=data_type))
+
+    generate_lbm_package(ctx, name="InterpolationNoSlip",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, no_slip_bouzidi, no_slip_quadraticbb, ubb],
+                         macroscopic_fields=macroscopic_fields, data_type=data_type)
+
+    generate_info_header(ctx, 'InterpolationNoSlipHeader')